Commit 9c8a2a14 authored by xinghao

Initial commit
# @lint-ignore-every LICENSELINT
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Utilities for MLPerf logging
"""
import os
import torch
try:
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants

    _MLLOGGER = mllog.get_mllogger()
except ImportError as error:
    print("Unable to import mlperf_logging, ", error)


def log_start(*args, **kwargs):
    "log with start tag"
    _log_print(_MLLOGGER.start, *args, **kwargs)


def log_end(*args, **kwargs):
    "log with end tag"
    _log_print(_MLLOGGER.end, *args, **kwargs)


def log_event(*args, **kwargs):
    "log with event tag"
    _log_print(_MLLOGGER.event, *args, **kwargs)


def _log_print(logger, *args, **kwargs):
    "makes mlperf logger aware of distributed execution"
    if "stack_offset" not in kwargs:
        kwargs["stack_offset"] = 3
    if "value" not in kwargs:
        kwargs["value"] = None

    if kwargs.pop("log_all_ranks", False):
        log = True
    else:
        log = get_rank() == 0

    if log:
        logger(*args, **kwargs)
def config_logger(benchmark):
    "initiates mlperf logger"
    mllog.config(
        filename=os.path.join(
            os.path.dirname(os.path.abspath(__file__)), f"{benchmark}.log"
        )
    )
    _MLLOGGER.logger.propagate = False
def barrier():
    """
    Works as a temporary distributed barrier; at the time this was written,
    PyTorch did not implement a barrier for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    return rank
def mlperf_submission_log(benchmark):
    """
    Logs information needed for MLPerf submission
    """
    config_logger(benchmark)
    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )
    log_event(key=constants.SUBMISSION_ORG, value="reference_implementation")
    log_event(key=constants.SUBMISSION_DIVISION, value="closed")
    log_event(key=constants.SUBMISSION_STATUS, value="onprem")
    log_event(key=constants.SUBMISSION_PLATFORM, value="reference_implementation")
    log_event(key=constants.SUBMISSION_ENTRY, value="reference_implementation")
    log_event(key=constants.SUBMISSION_POC_NAME, value="reference_implementation")
    log_event(key=constants.SUBMISSION_POC_EMAIL, value="reference_implementation")
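A brief usage sketch of these helpers (illustrative only; it assumes the file above is importable as mlperf_logger and that the mlperf_logging package is installed -- neither name is confirmed by this commit):

# Emit submission metadata once, then mark the run boundaries.
from mlperf_logging.mllog import constants

from mlperf_logger import log_end, log_start, mlperf_submission_log  # hypothetical module name

mlperf_submission_log("dlrm")            # configures <benchmark>.log next to the module
log_start(key=constants.RUN_START)       # only rank 0 logs unless log_all_ranks=True
# ... training loop ...
log_end(key=constants.RUN_STOP, metadata={"status": "success"})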
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from torch.optim import Optimizer
class RWSAdagrad(Optimizer):
    """Implements Row Wise Sparse Adagrad algorithm.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            gradient-square accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
    """
    def __init__(
        self,
        params,
        lr=1e-2,
        lr_decay=0.0,
        weight_decay=0.0,
        initial_accumulator_value=0.0,
        eps=1e-10,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                "Invalid initial_accumulator_value value: {}".format(
                    initial_accumulator_value
                )
            )
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        self.defaults = dict(
            lr=lr,
            lr_decay=lr_decay,
            eps=eps,
            weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value,
        )
        super(RWSAdagrad, self).__init__(params, self.defaults)

        self.momentum_initialized = False

        for group in self.param_groups:
            for p in group["params"]:
                self.state[p]["step"] = 0
    def share_memory(self):
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                if p.grad.data.is_sparse:
                    state["momentum"].share_memory_()
                else:
                    state["sum"].share_memory_()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                if not self.momentum_initialized:
                    if p.grad.data.is_sparse:
                        self.state[p]["momentum"] = torch.full(
                            [p.data.shape[0]],
                            self.defaults["initial_accumulator_value"],
                            dtype=torch.float32,
                        )
                    else:
                        self.state[p]["sum"] = torch.full_like(
                            p.data,
                            self.defaults["initial_accumulator_value"],
                            dtype=torch.float32,
                        )

                grad = p.grad
                state = self.state[p]
                state["step"] += 1

                if group["weight_decay"] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError(
                            "weight_decay option is not compatible with sparse gradients"
                        )
                    grad = grad.add(p.data, alpha=group["weight_decay"])

                clr = group["lr"] / (1.0 + (state["step"] - 1.0) * group["lr_decay"])

                if grad.is_sparse:
                    # The update is non-linear, so indices must be unique.
                    grad = grad.coalesce()
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values, row_wise):
                        constructor = grad.new
                        matrix_size = [size[0]] if row_wise else size
                        return constructor(grad_indices, values, matrix_size)

                    if grad_values.numel() > 0:
                        momentum_update = make_sparse(
                            grad_values.pow(2).mean(dim=1), True
                        )
                        state["momentum"].add_(momentum_update)  # update momentum
                        std = state["momentum"].sparse_mask(momentum_update.coalesce())
                        std_values = std._values().sqrt_().add_(group["eps"])
                        p.data.add_(
                            make_sparse(
                                grad_values / std_values.view(std_values.size()[0], 1),
                                False,
                            ),
                            alpha=-clr,
                        )
                else:
                    state["sum"].addcmul_(grad, grad, value=1.0)
                    std = state["sum"].sqrt().add_(group["eps"])
                    p.data.addcdiv_(grad, std, value=-clr)

        self.momentum_initialized = True

        return loss
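A brief usage sketch for the optimizer above (illustrative; the EmbeddingBag model and its sizes are placeholders, not part of this commit):

# Row-wise sparse Adagrad keeps one accumulator value per embedding row, so a
# sparse gradient from EmbeddingBag(..., sparse=True) exercises the `momentum`
# branch of step(); dense parameters fall back to element-wise Adagrad.
import torch

emb = torch.nn.EmbeddingBag(100, 16, mode="sum", sparse=True)
optimizer = RWSAdagrad(emb.parameters(), lr=0.01)

ids = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4])
loss = emb(ids, offsets).sum()
loss.backward()      # emb.weight.grad is a sparse tensor
optimizer.step()     # updates only the rows touched by `ids`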
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# WARNING: must have compiled PyTorch

# Check if an extra argument was passed to the test.
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_py="python dlrm_s_pytorch.py"
echo "Running commands ..."
#run pytorch
echo $dlrm_py
$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1
$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4
echo "All PyTorch tests completed."
echo "Output files: ppp1, ppp2, ppp3, ppp4"
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime
FROM ${FROM_IMAGE_NAME}
WORKDIR /workspace/torchrec_dlrm
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import torchx.specs as specs
from torchx.components.dist import ddp
def run_dlrm_main(num_trainers: int = 8, *script_args: str) -> specs.AppDef:
    """
    Args:
        num_trainers: The number of trainers to use.
        script_args: A variable number of parameters to provide dlrm_main.py.
    """
    cwd = os.getcwd()
    entrypoint = os.path.join(cwd, "dlrm_main.py")

    user = os.environ.get("USER")
    image = f"/data/home/{user}"

    if num_trainers > 8 and num_trainers % 8 != 0:
        raise ValueError(
            "Trainer jobs spanning multiple hosts must be in multiples of 8."
        )
    nproc_per_node = 8 if num_trainers >= 8 else num_trainers
    num_replicas = max(num_trainers // 8, 1)

    return ddp(
        *script_args,
        name="train_dlrm",
        image=image,
        # AWS p4d instance (https://aws.amazon.com/ec2/instance-types/p4/).
        cpu=96,
        gpu=8,
        memMB=-1,
        script=entrypoint,
        j=f"{num_replicas}x{nproc_per_node}",
    )
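A quick way to sanity-check this component is to build the AppDef locally without submitting it (illustrative; the module name torchx_run is an assumption, since the file name is not visible in this view):

# Build the AppDef and inspect the resulting topology.
from torchx_run import run_dlrm_main  # hypothetical module name

app = run_dlrm_main(16)               # 16 trainers -> j="2x8" (2 hosts, 8 procs each)
print(app.roles[0].num_replicas)      # expected to be 2 with the parameters above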
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Copied from https://github.com/facebookresearch/dlrm/blob/mlperf/dlrm_s_pytorch.py
import sys
from torch.optim.lr_scheduler import _LRScheduler
class LRPolicyScheduler(_LRScheduler):
    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps
        if self.decay_start_step < self.num_warmup_steps:
            sys.exit("Learning rate warmup must finish before the decay starts")

        super(LRPolicyScheduler, self).__init__(optimizer)
    def get_lr(self):
        step_count = self._step_count
        if step_count < self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step_count and step_count < self.decay_end_step:
            # decay
            decayed_steps = step_count - self.decay_start_step
            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            if self.num_decay_steps > 0:
                # freeze at the last value, either because we're after the decay
                # window or because we're between warmup and decay
                lr = self.last_lr
            else:
                # do not adjust
                lr = self.base_lrs
        return lr
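A small worked example of the schedule above (illustrative; the dummy parameter and SGD optimizer are placeholders, not part of this commit):

# Linear warmup for 4 steps, a hold phase, then quadratic decay over 4 steps
# toward the 1e-7 floor; printing the lr makes the three phases visible.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=1.0)
scheduler = LRPolicyScheduler(
    optimizer, num_warmup_steps=4, decay_start_step=8, num_decay_steps=4
)

for step in range(12):
    optimizer.step()
    scheduler.step()
    print(step, [round(g["lr"], 6) for g in optimizer.param_groups])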
9287c283b01087427df915257300bf36 day_0_sparse_multi_hot.npz
f4fedd921a4b214b03d0dfcd31cc30e6 day_1_sparse_multi_hot.npz
2a15bdd8d25781c4cdcf5d791dfd19f9 day_2_sparse_multi_hot.npz
0341aaee2f661f9e939a39d7a6be0aea day_3_sparse_multi_hot.npz
e8db54dbf5fe438ecb76fcc2d520f31a day_4_sparse_multi_hot.npz
fd35a7a2bc0ba63935b4b1c742eca018 day_5_sparse_multi_hot.npz
7d5c72b6bbe8be1dfa1f69db5c7e64fd day_6_sparse_multi_hot.npz
59bcb9855243d3a5c0ae56daa18e1033 day_7_sparse_multi_hot.npz
b9f7fbccae6bb9fdabf30259b5e305f0 day_8_sparse_multi_hot.npz
03da5bf484870a3c77f66befc680ad04 day_9_sparse_multi_hot.npz
eb048fc4fbd7ffa7932b81a523fe5a39 day_10_sparse_multi_hot.npz
a2ebee45c9836c8c8598610a6a3e9d60 day_11_sparse_multi_hot.npz
0dd59855e1a7b65c42f7e7af303c610e day_12_sparse_multi_hot.npz
7510698f3fcff9f3d7ef9dd478e637aa day_13_sparse_multi_hot.npz
562978b7b93e179f1adb9b9f5e1dc338 day_14_sparse_multi_hot.npz
042967232f016fbccf0d40d72d0b48bb day_15_sparse_multi_hot.npz
7b59170fb0e2d1e78f15cb60cea22723 day_16_sparse_multi_hot.npz
5054c54515d2574cda0f11646787df44 day_17_sparse_multi_hot.npz
28d3dbf6c70e68f01df12a4c3298f754 day_18_sparse_multi_hot.npz
db7554263a1754d3e29341d0f03bc2f0 day_19_sparse_multi_hot.npz
91ee92ffb4810c26e157c1335ef4de06 day_20_sparse_multi_hot.npz
2c99fad7b146b0ba581dce34f640f44e day_21_sparse_multi_hot.npz
c7ba52c5aaf24d76acca22a0cb13b737 day_22_sparse_multi_hot.npz
c46b7e31ec6f2f8768fa60bdfc0f6e40 day_23_sparse_multi_hot.npz
427113b0c4d85a8fceaf793457302067 day_0_dense.npy
4db255ce4388893e7aa1dcf157077975 day_0_labels.npy
8b444e74159dbede896e2f3b5ed31ac0 day_0_sparse.npy
3afc11c56062d8bbea4df300b5a42966 day_1_dense.npy
fb40746738a7c6f4ee021033bdd518c5 day_1_labels.npy
61e95a487c955b515155b31611444f32 day_1_sparse.npy
4e73d5bb330c43826665bec142c6b407 day_2_dense.npy
f0adfec8191781e3f201d45f923e6ea1 day_2_labels.npy
0473d30872cd6e582c5da0272a0569f8 day_2_sparse.npy
df1f3395e0da4a06aa23b2e069ff3ad9 day_3_dense.npy
69caadf4d219f18b83f3591fe76f17c7 day_3_labels.npy
d6b0d02ff18da470b7ee17f97d5380e0 day_3_sparse.npy
27868a93adc66c47d4246acbad8bb689 day_4_dense.npy
c4a6a16342f0770d67d689c6c173c681 day_4_labels.npy
ca54008489cb84becc3f37e7b29035c7 day_4_sparse.npy
e9bc6de06d09b1feebf857d9786ee15c day_5_dense.npy
9e3e17f345474cfbde5d62b543e07d6b day_5_labels.npy
d1374ee84f80ea147957f8af0e12ebe4 day_5_sparse.npy
09c8bf0fd4798172e0369134ddc7204a day_6_dense.npy
945cef1132ceab8b23f4d0e269522be2 day_6_labels.npy
e4df1c271e1edd72ee4658a39cca2888 day_6_sparse.npy
ae718f0d6d29a8b605ae5d12fad3ffcc day_7_dense.npy
5ff5e7eef5b88b80ef03d06fc7e81bcf day_7_labels.npy
cbcb7501a6b74a45dd5c028c13a4afbc day_7_sparse.npy
5a589746fd15819afbc70e2503f94b35 day_8_dense.npy
43871397750dfdc69cadcbee7e95f2bd day_8_labels.npy
c1fb4369c7da27d23f4c7f97c8893250 day_8_sparse.npy
4bb86eecb92eb4e3368085c2b1bab131 day_9_dense.npy
f851934555147d436131230ebbdd5609 day_9_labels.npy
e4ac0fb8a030f0769541f88142c9f931 day_9_sparse.npy
7fc29f50da6c60185381ca4ad1cb2059 day_10_dense.npy
e3b3f6f974c4820064db0046bbf954c8 day_10_labels.npy
1018a9ab88c4a7369325c9d6df73b411 day_10_sparse.npy
df822ae73cbaa016bf7d371d87313b56 day_11_dense.npy
26219e9c89c6ce831e7da273da666df1 day_11_labels.npy
f1596fc0337443a6672a864cd541fb05 day_11_sparse.npy
015968b4d9940ec9e28cc34788013d6e day_12_dense.npy
f0ca7ce0ab6033cdd355df94d11c7ed7 day_12_labels.npy
03a2ebd22b01cc18b6e338de77b4103f day_12_sparse.npy
9d79239a9e976e4dd9b8839c7cbe1eba day_13_dense.npy
4b099b9200bbb490afc08b5cd63daa0e day_13_labels.npy
2b507e0f97d972ea6ada9b3af64de151 day_13_sparse.npy
9242e6c974603ec235f163f72fdbc766 day_14_dense.npy
80cae15e032ffb9eff292738ba4d0dce day_14_labels.npy
3dccc979f7c71fae45a10c98ba6c9cb7 day_14_sparse.npy
64c6c0fcd0940f7e0d7001aa945ec8f8 day_15_dense.npy
a6a730d1ef55368f3f0b21d32b039662 day_15_labels.npy
c852516852cc404cb40d4de8626d2ca1 day_15_sparse.npy
5c75b60e63e9cf98dec13fbb64839c10 day_16_dense.npy
5a71a29d8df1e8baf6bf28353f1588d4 day_16_labels.npy
6c838050751697a91bbf3e68ffd4a696 day_16_sparse.npy
9798bccb5a67c5eac834153ea8bbe110 day_17_dense.npy
0a814b7eb83f375dd5a555ade6908356 day_17_labels.npy
40d2bc23fbcccb3ddb1390cc5e694cf0 day_17_sparse.npy
cda094dfe7f5711877a6486f9863cd4b day_18_dense.npy
a4fa26ada0d4c312b7e3354de0f5ee30 day_18_labels.npy
51711de9194737813a74bfb25c0f5d30 day_18_sparse.npy
0f0b2c0ed279462cdcc6f79252fd3395 day_19_dense.npy
b21ad457474b01bd3f95fc46b6b9f04b day_19_labels.npy
dd4b72cd704981441d17687f526e42ae day_19_sparse.npy
95ffc084f6cafe382afe72cbcae186bc day_20_dense.npy
9555e572e8bee22d71db8c2ac121ea8a day_20_labels.npy
bc9a8c79c93ea39f32230459b4c4572a day_20_sparse.npy
4680683973be5b1a890c9314cfb2e93b day_21_dense.npy
672edc866e7ff1928d15338a99e5f336 day_21_labels.npy
e4a8ae42a6d46893da6edb73e7d8a3f7 day_21_sparse.npy
3d56f190639398da2bfdc33f87cd34f0 day_22_dense.npy
733da710c5981cb67d041aa1039e4e6b day_22_labels.npy
42ef88d6bb2550a88711fed6fc144846 day_22_sparse.npy
cdf7af87cbc7e9b468c0be46b1767601 day_23_dense.npy
dd68f93301812026ed6f58dfb0757fa7 day_23_labels.npy
0c33f1562529cc3bca7f3708e2be63c9 day_23_sparse.npy