#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import pathlib
import shutil
import sys

import numpy as np
import torch
from torch import distributed as dist, nn
from torchrec.datasets.criteo import DAYS

p = pathlib.Path(__file__).absolute().parents[1].resolve()
sys.path.append(os.fspath(p))

# OSS import
try:
    # pyre-ignore[21]
    # @manual=//ai_codesign/benchmarks/dlrm/torchrec_dlrm:multi_hot
    from multi_hot import Multihot
except ImportError:
    pass

# internal import
try:
    from .multi_hot import Multihot  # noqa F811
except ImportError:
    pass
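
# The two guarded imports above cover the open-source checkout and the internal
# build; whichever one succeeds provides the Multihot class used below.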


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Script to materialize synthetic multi-hot dataset into NumPy npz file format."
    )
    parser.add_argument(
        "--in_memory_binary_criteo_path",
        type=str,
        required=True,
        help="Path to a folder containing the binary (npy) files for the Criteo dataset."
        " When supplied, InMemoryBinaryCriteoIterDataPipe is used.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to outputted multi-hot sparse dataset.",
    )
    parser.add_argument(
        "--copy_labels_and_dense",
        dest="copy_labels_and_dense",
        action="store_true",
        help="Flag to determine whether to copy labels and dense data to the output directory.",
    )
    parser.add_argument(
        "--num_embeddings_per_feature",
        type=str,
        required=True,
        help="Comma separated max_ind_size per sparse feature. The number of embeddings"
        " in each embedding table. 26 values are expected for the Criteo dataset.",
    )
    parser.add_argument(
        "--multi_hot_sizes",
        type=str,
        required=True,
        help="Comma separated multihot size per sparse feature. 26 values are expected for the Criteo dataset.",
    )
    parser.add_argument(
        "--multi_hot_distribution_type",
        type=str,
        choices=["uniform", "pareto"],
        default="uniform",
        help="Multi-hot distribution options.",
    )
    return parser.parse_args()


def main() -> None:
    """
    This script generates and saves the MLPerf v2 multi-hot dataset (4 TB in size).
    First, run process_Criteo_1TB_Click_Logs_dataset.sh.
    Then, run this script as follows:

        python materialize_synthetic_multihot_dataset.py \
            --in_memory_binary_criteo_path $PREPROCESSED_CRITEO_1TB_CLICK_LOGS_DATASET_PATH \
            --output_path $MATERIALIZED_DATASET_PATH \
            --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \
            --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 \
            --multi_hot_distribution_type uniform

    This script takes about 2 hours to run on a single process; because each day is
    processed independently, it can be parallelized by launching multiple processes
    (the days are sharded across ranks).
    """
    args = parse_args()
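    # argparse returns the list-valued options (--num_embeddings_per_feature,
    # --multi_hot_sizes) as comma-separated strings; convert them to lists of
    # ints here. Non-string or non-numeric options are left untouched.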
    for name, val in vars(args).items():
        try:
            vars(args)[name] = list(map(int, val.split(",")))
        except (ValueError, AttributeError):
            pass
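    # Use torch.distributed when the script is launched by a multi-process
    # launcher (e.g. torchrun); otherwise fall back to a single process with
    # rank 0 and world size 1.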
    try:
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if not dist.is_initialized():
            dist.init_process_group(backend=backend)
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    except (KeyError, ValueError):
        rank = 0
        world_size = 1

    print("Generating one-hot to multi-hot lookup table.")
    multihot = Multihot(
        multi_hot_sizes=args.multi_hot_sizes,
        num_embeddings_per_feature=args.num_embeddings_per_feature,
        batch_size=1,  # Doesn't matter
        collect_freqs_stats=False,
        dist_type=args.multi_hot_distribution_type,
    )

    os.makedirs(args.output_path, exist_ok=True)

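    # Shard the work across ranks: rank r materializes days r, r + world_size, ...
    # so multiple processes each handle a disjoint subset of the Criteo days.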
    for i in range(rank, DAYS, world_size):
        input_file_path = os.path.join(
            args.in_memory_binary_criteo_path, f"day_{i}_sparse.npy"
        )
        print(f"Materializing {input_file_path}")
        sparse_data = np.load(input_file_path, mmap_mode="r")
        multi_hot_ids_dict = {}
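        # For each sparse feature: hash the raw one-hot ids into the embedding
        # table's index range with a modulo, then expand every id to its fixed
        # multi-hot row via an embedding-style lookup into the precomputed table.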
        for j, (multi_hot_table, hash_size) in enumerate(
            zip(multihot.multi_hot_tables_l, args.num_embeddings_per_feature)
        ):
            sparse_tensor = torch.from_numpy(sparse_data[:, j] % hash_size)
            multi_hot_ids_dict[str(j)] = nn.functional.embedding(
                sparse_tensor, multi_hot_table
            ).numpy()
        output_file_path = os.path.join(
            args.output_path, f"day_{i}_sparse_multi_hot.npz"
        )
        np.savez(output_file_path, **multi_hot_ids_dict)
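        # The npz holds one array per feature, keyed by the feature index as a
        # string ("0" .. "25"); array j has shape (num_samples, multi_hot_sizes[j]).
        # Hypothetical example of reading one day back:
        #     data = np.load(f"day_{i}_sparse_multi_hot.npz")
        #     feature_0 = data["0"]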
        if args.copy_labels_and_dense:
            for part in ["labels", "dense"]:
                source_path = os.path.join(
                    args.in_memory_binary_criteo_path, f"day_{i}_{part}.npy"
                )
                output_path = os.path.join(args.output_path, f"day_{i}_{part}.npy")
                print(f"Copying {source_path} to {output_path}")
                shutil.copyfile(source_path, output_path)


if __name__ == "__main__":
    main()