#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import pathlib
import shutil
import sys

import numpy as np
import torch
from torch import distributed as dist, nn
from torchrec.datasets.criteo import DAYS

p = pathlib.Path(__file__).absolute().parents[1].resolve()
sys.path.append(os.fspath(p))

# OSS import
try:
    # pyre-ignore[21]
    # @manual=//ai_codesign/benchmarks/dlrm/torchrec_dlrm:multi_hot
    from multi_hot import Multihot
except ImportError:
    pass

# internal import
try:
    from .multi_hot import Multihot  # noqa F811
except ImportError:
    pass
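
# The two guarded imports above cover the open-source checkout and the internal
# build; whichever one succeeds provides the Multihot class used below.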


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Script to materialize synthetic multi-hot dataset into NumPy npz file format."
    )
    parser.add_argument(
        "--in_memory_binary_criteo_path",
        type=str,
        required=True,
        help="Path to a folder containing the binary (npy) files for the Criteo dataset."
        " When supplied, InMemoryBinaryCriteoIterDataPipe is used.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to outputted multi-hot sparse dataset.",
    )
    parser.add_argument(
        "--copy_labels_and_dense",
        dest="copy_labels_and_dense",
        action="store_true",
        help="Flag to determine whether to copy labels and dense data to the output directory.",
    )
    parser.add_argument(
        "--num_embeddings_per_feature",
        type=str,
        required=True,
        help="Comma separated max_ind_size per sparse feature. The number of embeddings"
        " in each embedding table. 26 values are expected for the Criteo dataset.",
    )
    parser.add_argument(
        "--multi_hot_sizes",
        type=str,
        required=True,
        help="Comma separated multihot size per sparse feature. 26 values are expected for the Criteo dataset.",
    )
    parser.add_argument(
        "--multi_hot_distribution_type",
        type=str,
        choices=["uniform", "pareto"],
        default="uniform",
        help="Multi-hot distribution options.",
    )
    return parser.parse_args()


def main() -> None:
    """
    This script generates and saves the MLPerf v2 multi-hot dataset (4 TB in size).
    First, run process_Criteo_1TB_Click_Logs_dataset.sh.
    Then, run this script as follows:

        python materialize_synthetic_multihot_dataset.py \
            --in_memory_binary_criteo_path $PREPROCESSED_CRITEO_1TB_CLICK_LOGS_DATASET_PATH \
            --output_path $MATERIALIZED_DATASET_PATH \
            --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \
            --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 \
            --multi_hot_distribution_type uniform

    This script takes about 2 hours to run on a single process; because each day is
    processed independently, it can be parallelized by launching multiple processes
    (the days are sharded across ranks).
    """
    args = parse_args()
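    # argparse returns the list-valued options (--num_embeddings_per_feature,
    # --multi_hot_sizes) as comma-separated strings; convert them to lists of
    # ints here. Non-string or non-numeric options are left untouched.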
    for name, val in vars(args).items():
        try:
            vars(args)[name] = list(map(int, val.split(",")))
        except (ValueError, AttributeError):
            pass
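    # Use torch.distributed when the script is launched by a multi-process
    # launcher (e.g. torchrun); otherwise fall back to a single process with
    # rank 0 and world size 1.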
    try:
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if not dist.is_initialized():
            dist.init_process_group(backend=backend)
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    except (KeyError, ValueError):
        rank = 0
        world_size = 1

    print("Generating one-hot to multi-hot lookup table.")
    multihot = Multihot(
        multi_hot_sizes=args.multi_hot_sizes,
        num_embeddings_per_feature=args.num_embeddings_per_feature,
        batch_size=1,  # Doesn't matter
        collect_freqs_stats=False,
        dist_type=args.multi_hot_distribution_type,
    )

    os.makedirs(args.output_path, exist_ok=True)

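    # Shard the work across ranks: rank r materializes days r, r + world_size, ...
    # so multiple processes each handle a disjoint subset of the Criteo days.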
    for i in range(rank, DAYS, world_size):
        input_file_path = os.path.join(
            args.in_memory_binary_criteo_path, f"day_{i}_sparse.npy"
        )
        print(f"Materializing {input_file_path}")
        sparse_data = np.load(input_file_path, mmap_mode="r")
        multi_hot_ids_dict = {}
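        # For each sparse feature: hash the raw one-hot ids into the embedding
        # table's index range with a modulo, then expand every id to its fixed
        # multi-hot row via an embedding-style lookup into the precomputed table.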
        for j, (multi_hot_table, hash_size) in enumerate(
            zip(multihot.multi_hot_tables_l, args.num_embeddings_per_feature)
        ):
            sparse_tensor = torch.from_numpy(sparse_data[:, j] % hash_size)
            multi_hot_ids_dict[str(j)] = nn.functional.embedding(
                sparse_tensor, multi_hot_table
            ).numpy()
        output_file_path = os.path.join(
            args.output_path, f"day_{i}_sparse_multi_hot.npz"
        )
        np.savez(output_file_path, **multi_hot_ids_dict)
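        # The npz holds one array per feature, keyed by the feature index as a
        # string ("0" .. "25"); array j has shape (num_samples, multi_hot_sizes[j]).
        # Hypothetical example of reading one day back:
        #     data = np.load(f"day_{i}_sparse_multi_hot.npz")
        #     feature_0 = data["0"]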
        if args.copy_labels_and_dense:
            for part in ["labels", "dense"]:
                source_path = os.path.join(
                    args.in_memory_binary_criteo_path, f"day_{i}_{part}.npy"
                )
                output_path = os.path.join(args.output_path, f"day_{i}_{part}.npy")
                print(f"Copying {source_path} to {output_path}")
                shutil.copyfile(source_path, output_path)


if __name__ == "__main__":
    main()