"mmdet3d/datasets/transforms/__init__.py" did not exist on "bdb3c14d6ce427d8de147c20cb6521e3cf3b8ed6"
Commit 9c8a2a14 authored by xinghao's avatar xinghao
Browse files

Initial commit

parents
Pipeline #3002 canceled with stages
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code
## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Base image is overridable at build time:
#   docker build --build-arg FROM_IMAGE_NAME=<image> .
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}

# Install Python dependencies first so this layer caches independently of
# source-code changes.
ADD requirements.txt .
RUN pip install -r requirements.txt
# NOTE(review): the base image already ships torch 1.3; this pins the exact
# 1.3.1 build — confirm the explicit reinstall is still required.
RUN pip install torch==1.3.1

# Copy the DLRM source tree into the image working directory.
WORKDIR /code
ADD . .
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
DLRM:Deep Learning Recommendation Model for Personalization and Recommendation Systems
=================================================================================
简介
------------
一个深度学习推荐模型(DLRM)的实现。
模型的输入由稠密特征和稀疏特征组成。前者是一个浮点值向量;后者是一组稀疏索引,用于查找嵌入表中的向量(这些嵌入表由浮点向量组成)。
选取到的这些向量会被送入若干 多层感知机(MLP)网络(通常在示意图中用三角形表示),在某些情况下,这些向量之间还会通过特定的算子(Ops)进行交互
```
output:
probability of a click
model: |
/\
/__\
|
_____________________> Op <___________________
/ | \
/\ /\ /\
/__\ /__\ ... /__\
| | |
| Op Op
| ____/__\_____ ____/__\____
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```
对模型各层的更精确定义:
1)MLP(多层感知机)的全连接层
z = f(y)
y = Wx + b
2)嵌入查找(针对一组稀疏索引 p=[p1,...,pk]p = [p_1, ..., p_k]p=[p1,...,pk])
z = Op(e1,...,ek)
obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
3)算子 Op 可以是以下几种之一
Sum(e1,...,ek) = e1 + ... + ek
Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
Cat(e1,...,ek) = [e1', ..., ek']'
where ' denotes transpose operation
部署
--------------
### Docker
**容器创建**
```bash
docker run --shm-size 500g --network=host --name=dlrm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/workspace/:/path/to/workspace/ -v /opt/hyhal:/opt/hyhal:ro -it image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10 bash
```
**依赖安装**
```bash
cd dlrm
pip install -r requirements.txt
pip install tensorboard
```
注意:使用 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 会导致 `torchrec-nightly` 相关依赖安装失败
Demo
--------------------
1)使用微型模型运行代码
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
```
<img src="./images/image1.png" width="900">
2)在调试模式下使用微型模型运行代码
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
```
<img src="./images/image2.png" width="900">
测试
-------
验证代码功能正确性
```bash
./test/dlrm_s_test.sh
```
<img src="./images/image3.png" width="900">
基准测试
------------
1)性能基准测试
```bash
./bench/dlrm_s_benchmark.sh
```
2)代码支持数据集 [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/)
- 请按以下步骤准备数据,以便在 DLRM 代码中使用:
- 首先,指定下载好的原始数据文件(train.txt),使用参数 `--raw-data-file=<path/train.txt>`
- 然后对数据进行预处理(分类、跨天合并等),以便在 DLRM 代码中使用
- 预处理后的数据会存储为 `*.npz` 文件,路径为 `<root_dir>/input/*.npz`
- 预处理后的文件 (`*.npz`) 可以在后续运行中直接使用,参数为 `--processed-data-file=<path/*.npz>`
- 可以使用以下脚本对模型进行训练
```bash
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
若要启用gpu,添加参数 `--use-gpu`,若要启用纯推理模型,添加参数 `--inference-only` 并使用参数 `--load-model`指定权重文件
<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3)代码支持数据集 [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).
- 请按以下步骤准备数据,以便在 DLRM 代码中使用:
- 首先,下载原始数据文件 `day_0.gz` 到 `day_23.gz` 并解压
- 使用参数 `--raw-data-file=<path/day>` 指定解压后的文本文件 `day_0` 到 `day_23` 的位置(天数后缀会自动追加)
- 然后对数据进行预处理(分类、跨天合并等),以便在 DLRM 代码中使用
- 预处理后的数据会存储为 `*.npz` 文件,路径为 `<root_dir>/input/*.npz`
- 预处理后的文件 (`*.npz`) 可以在后续运行中直接使用,参数为 `--processed-data-file=<path/*.npz>`
- 可以使用以下脚本对模型进行训练
```bash
./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"]
```
​ 若要启用gpu,添加参数 `--use-gpu`,若要启用纯推理模型,添加参数 `--inference-only` 并使用参数 `--load-model`指定权重文件
- 对应的预训练模型可从以下链接下载:[dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt)
<img src="./terabyte_0875_loss_accuracy_plots.png" width="900" height="320">
4)代码支持 [MLPerf benchmark](https://mlperf.org).
- 请参考以下训练参数
```bash
--mlperf-logging 用于跟踪多个指标,包括曲线下面积(AUC)
--mlperf-acc-threshold 允许基于准确率指标提前停止训练
--mlperf-auc-threshold 允许基于 AUC 指标提前停止训练
--mlperf-bin-loader 启用将数据预处理成单个二进制文件
--mlperf-bin-shuffle 控制是否对小批量数据进行随机打乱
```
- MLPerf 模型可使用以下脚本进行训练。
```bash
./bench/run_and_time.sh [--use-gpu]
```
- 对应的预训练模型可从以下链接下载:[dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt)
5)该代码现在支持同步分布式训练,支持 gloo/nccl/mpi 后端,同时提供了 [PyTorch 分布式启动器](https://pytorch.org/docs/stable/distributed.html#launch-utility) 和 Mpirun 的启动方式。对于 MPI,用户需要自行编写 MPI 启动脚本来配置运行主机。例如,使用 PyTorch 分布式启动器,可以使用如下命令作为启动脚本:
```bash
# 在单节点 8 GPU 环境下,使用 NCCL 作为后端处理随机生成的数据集时:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000
--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# 对于多节点环境,用户可以根据启动器手册添加相关参数,例如:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
模型检查点保存/加载
-------------------------------
在训练过程中,可以使用参数 `--save-model=<path/model.pt>` 保存模型
当测试准确率有所提升时(按 `--test-freq` 指定的间隔检查),模型会被保存
已保存的模型可以通过 `--load-model=<path/model.pt>` 加载
加载后,模型可以用于继续训练(已保存的模型相当于一个检查点);或者,也可以通过指定 `--inference-only` 选项,仅使用保存的模型在测试数据集上进行评估
参考资料
-------
https://github.com/facebookresearch/dlrm
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Benchmark driver: runs the DLRM PyTorch implementation on random data,
# once pinned to CPU cores and once per GPU count, and reports the minimum
# per-iteration time extracted from each run's log.

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

# Which benchmark flavors to run (1 = enabled).
cpu=1
gpu=1
pt=1

# CPU run shape: pin to ncores cores on socket nsockets (no hyper-threading).
ncores=28 #12 #6
nsockets="0"

# GPU sweep: run once for each GPU count listed here.
ngpus="1 2 4 8"

numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"

data=random #synthetic
print_freq=100
rand_seed=727

#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384

# Common argument string shared by the CPU and GPU invocations below.
#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "

# CPU Benchmarking
if [ $cpu = 1 ]; then
    echo "--------------------------------------------"
    echo "CPU Benchmarking - running on $ncores cores"
    echo "--------------------------------------------"
    if [ $pt = 1 ]; then
        outf="model1_CPU_PT_$ncores.log"
        outp="dlrm_s_pytorch.prof"
        echo "-------------------------------"
        echo "Running PT (log file: $outf)"
        echo "-------------------------------"
        cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
        echo $cmd
        eval $cmd
        # Field 7 of the "iteration" log lines is taken as the per-iteration
        # time; keep only the smallest value across the run.
        # NOTE(review): confirm the field index against dlrm_s_pytorch.py's
        # print format.
        min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
        echo "Min time per iteration = $min"
        # move profiling file(s)
        mv $outp ${outf//".log"/".prof"}
        mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
    fi
fi

# GPU Benchmarking
if [ $gpu = 1 ]; then
    echo "--------------------------------------------"
    echo "GPU Benchmarking - running on $ngpus GPUs"
    echo "--------------------------------------------"
    for _ng in $ngpus
    do
        # weak scaling
        # _mb_size=$((mb_size*_ng))
        # strong scaling
        _mb_size=$((mb_size*1))
        # Expose GPUs 0..(_ng-1) to the run via CUDA_VISIBLE_DEVICES.
        _gpus=$(seq -s, 0 $((_ng-1)))
        cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
        echo "-------------------"
        echo "Using GPUS: "$_gpus
        echo "-------------------"
        if [ $pt = 1 ]; then
            outf="model1_GPU_PT_$_ng.log"
            outp="dlrm_s_pytorch.prof"
            echo "-------------------------------"
            echo "Running PT (log file: $outf)"
            echo "-------------------------------"
            cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
            echo $cmd
            eval $cmd
            # Same minimum-iteration-time extraction as the CPU run above.
            min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
            echo "Min time per iteration = $min"
            # move profiling file(s)
            mv $outp ${outf//".log"/".prof"}
            mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
        fi
    done
fi
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Train DLRM on the Criteo Kaggle Display Advertising Challenge dataset.
# Any single extra argument is forwarded verbatim to dlrm_s_pytorch.py
# (e.g. "--use-gpu" or "--test-freq=1024").

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
# Raw data is read from ./input/train.txt on first run; the preprocessed
# .npz is reused on subsequent runs. Output is mirrored to run_kaggle_pt.log.
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log

echo "done"
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Train DLRM on the Criteo Terabyte dataset (day_0 .. day_23 raw files under
# ./input). Any single extra argument is forwarded verbatim to
# dlrm_s_pytorch.py (e.g. "--test-freq=10240 --memory-map").

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
# --max-ind-range=10000000 hashes sparse indices into a 10M-row embedding.
# Output is mirrored to run_terabyte_pt.log.
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log

echo "done"
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# MLPerf reference run on the Criteo Terabyte dataset: 40M-row embeddings,
# binary data loader with shuffling, AUC-based early stop at 0.8025.
# Any single extra argument (e.g. "--use-gpu") is forwarded verbatim.

#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

# Output is mirrored to run_terabyte_mlperf_pt.log.
python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log

echo "done"
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals

from distutils.extension import Extension
from Cython.Build import cythonize
from setuptools import setup

# Build data_utils_cython as an optimized C extension from its .pyx source:
#   python cython_compile.py build_ext --inplace
_SOURCES = ["data_utils_cython.pyx"]
_OPT_FLAGS = ["-O3"]

ext_modules = [
    Extension(
        "data_utils_cython",
        _SOURCES,
        extra_compile_args=_OPT_FLAGS,
        extra_link_args=_OPT_FLAGS,
    )
]

setup(name="data_utils_cython", ext_modules=cythonize(ext_modules))
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
# > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile the data_utils_cython.pyx to generate .so
# (it's important to keep extension .pyx rather than .py
# to ensure the C/C++ .so, not the .py, is loaded at import time)
# > python cython_compile.py build_ext --inplace
# This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals
import data_utils_cython as duc
if __name__ == "__main__":
### import packages ###
import argparse
### parse arguments ###
parser = argparse.ArgumentParser(description="Preprocess Criteo dataset")
# model related parameters
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--memory-map", action="store_true", default=False)
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
args = parser.parse_args()
duc.loadDataset(
args.data_set,
args.max_ind_range,
args.data_sub_sample_rate,
args.data_randomize,
"train",
args.raw_data_file,
args.processed_data_file,
args.memory_map,
)
# @lint-ignore-every LICENSELINT
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import math
import os
import time
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
class DataLoader:
    """
    Iterable loader for the Criteo Terabyte Click Logs dataset stored as
    per-day ``<prefix>_<day>_reordered.npz`` files plus a
    ``<prefix>_day_count.npz`` file holding per-day sample counts.
    """

    def __init__(
        self,
        data_filename,
        data_directory,
        days,
        batch_size,
        max_ind_range=-1,
        split="train",
        drop_last_batch=False,
    ):
        self.data_filename = data_filename
        self.data_directory = data_directory
        self.days = days
        self.batch_size = batch_size
        self.max_ind_range = max_ind_range

        # Total sample count comes from the precomputed per-day counts file,
        # restricted to the requested days.
        counts_path = os.path.join(data_directory, data_filename + "_day_count.npz")
        with np.load(counts_path) as counts_npz:
            samples_per_day = counts_npz["total_per_file"][np.array(days)]
        self.length = sum(samples_per_day)
        # "test" and "val" each see half of the selected data (ceil on odd).
        if split in ("test", "val"):
            self.length = int(np.ceil(self.length / 2.0))

        self.split = split
        self.drop_last_batch = drop_last_batch

    def __iter__(self):
        # Delegate to the module-level generator that streams the day files.
        yield from _batch_generator(
            self.data_filename,
            self.data_directory,
            self.days,
            self.batch_size,
            self.split,
            self.drop_last_batch,
            self.max_ind_range,
        )

    def __len__(self):
        # Number of batches: floor when the trailing partial batch is dropped,
        # ceiling when it is kept.
        full_batches, remainder = divmod(self.length, self.batch_size)
        if self.drop_last_batch:
            return full_batches
        return full_batches + (1 if remainder else 0)
def _transform_features(
x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
):
if max_ind_range > 0:
x_cat_batch = x_cat_batch % max_ind_range
if flag_input_torch_tensor:
x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
else:
x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
batch_size = x_cat_batch.shape[0]
feature_count = x_cat_batch.shape[1]
lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
def _batch_generator(
    data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
):
    # Streams fixed-size batches across multiple per-day .npz files.
    # Samples left over at the end of one day are carried in `previous_file`
    # and prepended to the first batch of the next day, so every yielded
    # batch (except possibly the final one) has exactly `batch_size` rows.
    previous_file = None
    for day in days:
        filepath = os.path.join(
            data_directory, data_filename + "_{}_reordered.npz".format(day)
        )
        # print('Loading file: ', filepath)
        with np.load(filepath) as data:
            x_int = data["X_int"]
            x_cat = data["X_cat"]
            y = data["y"]
        samples_in_file = y.shape[0]
        batch_start_idx = 0
        # "test" takes the first half of each file, "val" the second half.
        if split == "test" or split == "val":
            length = int(np.ceil(samples_in_file / 2.0))
            if split == "test":
                samples_in_file = length
            elif split == "val":
                batch_start_idx = samples_in_file - length
        while batch_start_idx < samples_in_file - batch_size:
            # If a partial batch was carried over, only fetch enough rows to
            # top it up to a full batch.
            missing_samples = batch_size
            if previous_file is not None:
                missing_samples -= previous_file["y"].shape[0]

            current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)

            x_int_batch = x_int[current_slice]
            x_cat_batch = x_cat[current_slice]
            y_batch = y[current_slice]

            if previous_file is not None:
                # Prepend the carried-over rows, then clear the carry.
                x_int_batch = np.concatenate(
                    [previous_file["x_int"], x_int_batch], axis=0
                )
                x_cat_batch = np.concatenate(
                    [previous_file["x_cat"], x_cat_batch], axis=0
                )
                y_batch = np.concatenate([previous_file["y"], y_batch], axis=0)
                previous_file = None

            if x_int_batch.shape[0] != batch_size:
                raise ValueError("should not happen")

            yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)

            batch_start_idx += missing_samples
        if batch_start_idx != samples_in_file:
            # Stash the day's tail (less than a full batch) for the next day.
            current_slice = slice(batch_start_idx, samples_in_file)
            if previous_file is not None:
                previous_file = {
                    "x_int": np.concatenate(
                        [previous_file["x_int"], x_int[current_slice]], axis=0
                    ),
                    "x_cat": np.concatenate(
                        [previous_file["x_cat"], x_cat[current_slice]], axis=0
                    ),
                    "y": np.concatenate([previous_file["y"], y[current_slice]], axis=0),
                }
            else:
                previous_file = {
                    "x_int": x_int[current_slice],
                    "x_cat": x_cat[current_slice],
                    "y": y[current_slice],
                }

    # After all days: emit the remaining partial batch unless dropped.
    if not drop_last:
        yield _transform_features(
            previous_file["x_int"],
            previous_file["x_cat"],
            previous_file["y"],
            max_ind_range,
        )
def _test():
    """Smoke-test _batch_generator on local ./input day files, printing the
    wall-clock time and tensor shapes for every batch produced."""
    batches = _batch_generator(
        data_filename="day",
        data_directory="./input",
        days=range(23),
        split="train",
        batch_size=2048,
        drop_last=True,
        max_ind_range=-1,
    )

    last_tick = time.time()
    for dense, offsets, sparse, labels in batches:
        now = time.time()
        elapsed = now - last_tick
        last_tick = now
        print(
            "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
                elapsed, dense.shape, offsets.shape, sparse.shape, labels.shape
            )
        )
class CriteoBinDataset(Dataset):
    """Criteo dataset backed by a single flat binary file of int32 records.

    Each sample is 1 label + 13 dense + 26 sparse features; one __getitem__
    returns a whole mini-batch of `batch_size` samples.
    """

    def __init__(
        self,
        data_file,
        counts_file,
        batch_size=1,
        max_ind_range=-1,
        bytes_per_feature=4,
    ):
        # Fixed record layout for Criteo: label, dense, sparse columns.
        self.tar_fea = 1  # single target
        self.den_fea = 13  # 13 dense features
        self.spa_fea = 26  # 26 sparse features
        self.tad_fea = self.tar_fea + self.den_fea
        self.tot_fea = self.tad_fea + self.spa_fea

        self.batch_size = batch_size
        self.max_ind_range = max_ind_range
        # One "entry" is an entire mini-batch worth of raw bytes.
        self.bytes_per_entry = bytes_per_feature * self.tot_fea * batch_size
        self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)

        print("data file:", data_file, "number of batches:", self.num_entries)
        self.file = open(data_file, "rb")

        with np.load(counts_file) as data:
            self.counts = data["counts"]

        # hardcoded for now
        self.m_den = 13

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Seek to the idx-th batch record and decode it as int32 rows.
        self.file.seek(idx * self.bytes_per_entry, 0)
        raw_data = self.file.read(self.bytes_per_entry)
        batch = torch.from_numpy(np.frombuffer(raw_data, dtype=np.int32)).view(
            (-1, self.tot_fea)
        )
        # Column 0 is the label, 1..13 the dense features, the rest sparse.
        return _transform_features(
            x_int_batch=batch[:, 1:14],
            x_cat_batch=batch[:, 14:],
            y_batch=batch[:, 0],
            max_ind_range=self.max_ind_range,
            flag_input_torch_tensor=True,
        )

    def __del__(self):
        self.file.close()
def numpy_to_binary(input_files, output_file_path, split="train"):
    """Convert the data to a binary format to be read with CriteoBinDataset.

    Rows are laid out as [y | X_int | X_cat] int32 columns. For "train" all
    input files are concatenated; for "test"/"val" a single input file is
    split at its midpoint (first half = test, second half = val).
    """
    # WARNING - both categorical and numerical data must fit into int32 for
    # the following code to work correctly

    def _rows_as_int32(npz):
        # One flat int32 row per sample: label column first, then features.
        return np.concatenate(
            [npz["y"].reshape(-1, 1), npz["X_int"], npz["X_cat"]],
            axis=1,
        ).astype(np.int32)

    with open(output_file_path, "wb") as output_file:
        if split == "train":
            for input_file in input_files:
                print("Processing file: ", input_file)
                output_file.write(_rows_as_int32(np.load(input_file)).tobytes())
        else:
            assert len(input_files) == 1
            rows = _rows_as_int32(np.load(input_files[0]))
            midpoint = int(np.ceil(rows.shape[0] / 2.0))
            if split == "test":
                begin, end = 0, midpoint
            elif split == "val":
                begin, end = midpoint, rows.shape[0]
            else:
                raise ValueError("Unknown split value: ", split)
            output_file.write(rows[begin:end].tobytes())
def _preprocess(args):
    """Convert reordered Criteo Terabyte day files into train/val/test binaries.

    Days 0-22 form the training set; day 23 is split between val and test by
    numpy_to_binary. Outputs <output_directory>/{split}_data.bin.
    """
    train_files = [
        "{}_{}_reordered.npz".format(args.input_data_prefix, day)
        for day in range(0, 23)
    ]
    holdout_file = args.input_data_prefix + "_23_reordered.npz"

    os.makedirs(args.output_directory, exist_ok=True)
    for split in ("train", "val", "test"):
        print("Running preprocessing for split =", split)
        destination = os.path.join(args.output_directory, "{}_data.bin".format(split))
        sources = train_files if split == "train" else [holdout_file]
        numpy_to_binary(
            input_files=sources, output_file_path=destination, split=split
        )
def _test_bin():
    # CLI self-test: converts the .npz day files to the flat binary format,
    # then cross-checks every batch of the binary-backed dataset against the
    # original CriteoDataset pipeline, raising on the first mismatch.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_directory", required=True)
    parser.add_argument("--input_data_prefix", required=True)
    parser.add_argument("--split", choices=["train", "test", "val"], required=True)
    args = parser.parse_args()

    # Writes <output_directory>/{train,val,test}_data.bin from the day files.
    _preprocess(args)

    binary_data_file = os.path.join(
        args.output_directory, "{}_data.bin".format(args.split)
    )
    counts_file = os.path.join(args.output_directory, "day_fea_count.npz")
    dataset_binary = CriteoBinDataset(
        data_file=binary_data_file,
        counts_file=counts_file,
        batch_size=2048,
    )
    # Imported lazily: dlrm_data_pytorch is only needed for this comparison.
    from dlrm_data_pytorch import (
        collate_wrapper_criteo_offset as collate_wrapper_criteo,
        CriteoDataset,
    )

    # batch_size=None because CriteoBinDataset already yields whole batches.
    binary_loader = torch.utils.data.DataLoader(
        dataset_binary,
        batch_size=None,
        shuffle=False,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
    )

    original_dataset = CriteoDataset(
        dataset="terabyte",
        max_ind_range=10 * 1000 * 1000,
        sub_sample_rate=1,
        randomize=True,
        split=args.split,
        raw_path=args.input_data_prefix,
        pro_data="dummy_string",
        memory_map=True,
    )

    original_loader = torch.utils.data.DataLoader(
        original_dataset,
        batch_size=2048,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_wrapper_criteo,
        pin_memory=False,
        drop_last=False,
    )

    assert len(dataset_binary) == len(original_loader)
    # Compare each element of every batch (x_int, lS_o, x_cat, y) pairwise.
    for i, (old_batch, new_batch) in tqdm(
        enumerate(zip(original_loader, binary_loader)), total=len(dataset_binary)
    ):
        for j in range(len(new_batch)):
            if not np.array_equal(old_batch[j], new_batch[j]):
                raise ValueError("FAILED: Datasets not equal")
        if i > len(dataset_binary):
            break
    print("PASSED")
if __name__ == "__main__":
    # NOTE(review): _test() streams ./input day files and _test_bin() requires
    # CLI flags (--output_directory, --input_data_prefix, --split); running
    # this module directly therefore needs both the data and those arguments.
    _test()
    _test_bin()
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the DLRM benchmark
#
# Utility function(s) to download and pre-process public data sets
# - Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# - Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
#
# After downloading dataset, run:
# getCriteoAdData(
# datafile="<path-to-train.txt>",
# o_filename=kaggleAdDisplayChallenge_processed.npz,
# max_ind_range=-1,
# sub_sample_rate=0.0,
# days=7,
# data_split='train',
# randomize='total',
# criteo_kaggle=True,
# memory_map=False
# )
# getCriteoAdData(
# datafile="<path-to-day_{0,...,23}>",
# o_filename=terabyte_processed.npz,
# max_ind_range=-1,
# sub_sample_rate=0.0,
# days=24,
# data_split='train',
# randomize='total',
# criteo_kaggle=False,
# memory_map=False
# )
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
from multiprocessing import Manager, Process
# import os
from os import path
# import io
# from io import StringIO
# import collections as coll
import numpy as np
def convertUStringToDistinctIntsDict(mat, convertDicts, counts):
    """Map a matrix of unicode strings to per-column distinct integer ids.

    Ids are assigned per column in first-seen order using (and extending) the
    supplied dictionaries. If convertDicts/counts do not match the number of
    columns, fresh empty ones are generated.

    Returns (out, convertDicts, counts): the id matrix (float array, same
    shape as mat), the updated per-column dictionaries, and the per-column
    distinct-value counts.
    """
    n_cols = mat.shape[1]
    # check if convertDicts and counts match correct length of mat
    if len(convertDicts) != n_cols or len(counts) != n_cols:
        print("Length of convertDicts or counts does not match input shape")
        print("Generating convertDicts and counts...")
        convertDicts = [{} for _ in range(n_cols)]
        counts = [0] * n_cols

    out = np.zeros(mat.shape)
    for col in range(n_cols):
        mapping = convertDicts[col]
        for row in range(mat.shape[0]):
            key = mat[row, col]
            # First occurrence gets the next free id for this column.
            if key not in mapping:
                mapping[key] = counts[col]
                counts[col] += 1
            out[row, col] = mapping[key]

    return out, convertDicts, counts
def convertUStringToDistinctIntsUnique(mat, mat_uni, counts):
    """Map a matrix of unicode strings to per-column distinct integer ids.

    Unlike the dict-based variant, ids follow np.unique's sorted order over
    the union of previously seen values (mat_uni) and the new column values.
    mat is an array of samples, each with 26 categorical features.

    Returns (out, mat_uni, counts): the id matrix, the updated per-column
    unique-value arrays, and the per-column distinct-value counts.
    """
    n_cols = mat.shape[1]
    # check if mat_unique and counts match correct length of mat
    if len(mat_uni) != n_cols or len(counts) != n_cols:
        print("Length of mat_unique or counts does not match input shape")
        print("Generating mat_unique and counts...")
        mat_uni = [np.array([]) for _ in range(n_cols)]
        counts = [0] * n_cols

    out = np.zeros(mat.shape)
    for col in range(n_cols):
        n_known = mat_uni[col].size
        # Unique over old + new values; the inverse map gives each element's
        # id, and the first n_known entries belong to the old values.
        combined = np.concatenate((mat_uni[col], mat[:, col]))
        mat_uni[col], inverse = np.unique(combined, return_inverse=True)
        out[:, col] = inverse[n_known:]
        counts[col] = mat_uni[col].size

    return out, mat_uni, counts
def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts):
    """Convert one day's raw split into its "_processed" npz file.

    Processes the Kaggle Display Advertising Challenge or Terabyte dataset by
    mapping each categorical value in X_cat_t through the pre-computed
    convertDicts and clamping negative dense features in X_int to zero.
    Loads "<npzfile>_<i>.npz" and writes "<npzfile>_<i>_processed.npz";
    work is skipped when the processed file already exists.

    Inputs:
        d_path (str): path for {kaggle|terabyte}_day_i.npz files
        d_file (str): dataset file prefix (unused here)
        npzfile (str): per-day npz file prefix
        i (int): split index (typically 0 to 7 or 0 to 24)
        convertDicts (list): per-column dict mapping raw value -> int id
        pre_comp_counts (list): pre-computed category counts (sanity only)
    """
    out_name = npzfile + "_{0}_processed.npz".format(i)
    if path.exists(out_name):
        print("Using existing " + out_name, end="\n")
        return
    print("Not existing " + out_name)
    with np.load(npzfile + "_{0}.npz".format(i)) as data:
        # categorical features: remap raw ids column by column through the
        # pre-computed per-column dictionaries
        X_cat_t = np.zeros(data["X_cat_t"].shape)
        for col in range(26):
            lookup = convertDicts[col]
            for row, raw in enumerate(data["X_cat_t"][col, :]):
                X_cat_t[col, row] = lookup[raw]
        # continuous features: clamp negative values to zero
        X_int = data["X_int"]
        X_int[X_int < 0] = 0
        # targets
        y = data["y"]
        np.savez_compressed(
            out_name,
            X_cat=np.transpose(X_cat_t),  # transpose of the data
            X_int=X_int,
            y=y,
        )
        print("Processed " + out_name, end="\n")
    return
def concatCriteoAdData(
    d_path,
    d_file,
    npzfile,
    trafile,
    days,
    data_split,
    randomize,
    total_per_file,
    total_count,
    memory_map,
    o_filename,
):
    """Concatenate the per-day processed files and save the result.

    Without memory_map, all days are loaded and concatenated into a single
    "<d_path><o_filename>.npz" file. With memory_map, a two-pass
    Fisher-Yates-Rao shuffle redistributes samples across per-day
    "_reordered.npz" files instead (no single output file is produced,
    but the same path string is returned).

    Inputs:
        d_path (str): path for {kaggle|terabyte}_day_i.npz files
        d_file (str): dataset file prefix
        npzfile (str): per-day npz file prefix
        trafile (str): prefix for transposed/intermediate feature files
        days (int): total number of days in the dataset (typically 7 or 24)
        data_split (str): dataset split flag ("none" shuffles every day)
        randomize (str): "none", "day" or "total" shuffling scheme
        total_per_file (list): number of samples per day
        total_count (int): total number of samples
        memory_map (bool): use the memory-mapped per-day reordering path
        o_filename (str): output file name

    Output:
        o_file (str): output file path
    """
    if memory_map:
        # dataset break up per fea
        # tar_fea = 1 # single target
        den_fea = 13  # 13 dense features
        spa_fea = 26  # 26 sparse features
        # tad_fea = tar_fea + den_fea
        # tot_fea = tad_fea + spa_fea
        # create cumulative offset per file: offset_per_file[i] is the index
        # of day i's first sample in the full concatenated dataset
        offset_per_file = np.array([0] + [x for x in total_per_file])
        for i in range(days):
            offset_per_file[i + 1] += offset_per_file[i]
        """
        # Approach 1, 2 and 3 use indices, while Approach 4 does not use them
        # create indices
        indices = np.arange(total_count)
        if data_split == "none":
            if randomize == "total":
                indices = np.random.permutation(indices)
        else:
            indices = np.array_split(indices, offset_per_file[1:-1])

            # randomize train data (per day)
            if randomize == "day":  # or randomize == "total":
                for i in range(len(indices) - 1):
                    indices[i] = np.random.permutation(indices[i])
                print("Randomized indices per day ...")

            train_indices = np.concatenate(indices[:-1])
            test_indices = indices[-1]

            # randomize train data (across days)
            if randomize == "total":
                train_indices = np.random.permutation(train_indices)
                print("Randomized indices across days ...")

            indices = np.concatenate((train_indices, test_indices))
        # no reordering
        # indices = np.arange(total_count)
        """
        """
        # Approach 1: simple and slow (no grouping is used)
        # check if data already exists
        recreate_flag = False
        for j in range(tot_fea):
            filename_j = trafile + "_{0}_reordered.npy".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((total_count))
            for j in range(tot_fea):
                filename_j = trafile + "_{0}_reordered".format(j)
                np.save(filename_j, z)
                print("Creating " + filename_j)

            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat_t = np.transpose(data["X_cat"])
                    X_int_t = np.transpose(data["X_int"])
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                # print(filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r+')
                    if j < tar_fea:
                        fj[indices[start:end]] = y
                    elif tar_fea <= j and j < tad_fea:
                        fj[indices[start:end]] = X_int_t[j - tar_fea, :]
                    else:
                        fj[indices[start:end]] = X_cat_t[j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing " + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                size = total_per_file[i]
                X_int_t = np.zeros((den_fea, size))
                X_cat_t = np.zeros((spa_fea, size))
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                print("Creating " + filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r')
                    if j < tar_fea:
                        y = fj[start:end]
                    elif tar_fea <= j and j < tad_fea:
                        X_int_t[j - tar_fea, :] = fj[start:end]
                    else:
                        X_cat_t[j - tad_fea, :] = fj[start:end]
                    del fj

                np.savez_compressed(
                    filename_i,
                    X_cat=np.transpose(X_cat_t),  # transpose of the data
                    X_int=np.transpose(X_int_t),  # transpose of the data
                    y=y,
                )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        """
        # Approach 2: group days
        # check if data already exists
        recreate_flag = False
        for j in range(tot_fea):
            filename_j = trafile + "_{0}_reordered.npy".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((total_count))
            for j in range(tot_fea):
                filename_j = trafile + "_{0}_reordered".format(j)
                np.save(filename_j, z)
                print("Creating " + filename_j)

            group_day = 3  # e.g. 8, 4 or 3
            group_num = days // group_day
            file_group = [i*group_day for i in range(group_num)] + [days]
            for ii in range(group_num):
                # for last may be group_size != group_num, therefore reset it below
                group_size = file_group[ii + 1] - file_group[ii]
                X_cat_t = [0]*group_size
                X_int_t = [0]*group_size
                y = [0]*group_size
                start = [0]*group_size
                end = [0]*group_size
                for ig in range(group_size):
                    i = file_group[ii] + ig
                    filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                    # setup start and end ranges
                    start[ig] = offset_per_file[i]
                    end[ig] = offset_per_file[i + 1]
                    # print(filename_i)
                    # load a group of files
                    with np.load(filename_i) as data:
                        X_cat_t[ig] = np.transpose(data["X_cat"])
                        X_int_t[ig] = np.transpose(data["X_int"])
                        y[ig] = data["y"]
                    # sanity check
                    if total_per_file[i] != len(y[ig]):
                        sys.exit("ERROR: sanity check on number of samples failed")
                    # print("start=" + str(start) + " end=" + str(end)
                    # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r+')
                    for ig in range(group_size):
                        if j < tar_fea:
                            fj[indices[start[ig]:end[ig]]] = y[ig]
                        elif tar_fea <= j and j < tad_fea:
                            fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :]
                        else:
                            fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing " + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for ii in range(group_num):
                # for last may be group_size != group_num, therefore reset it below
                group_size = file_group[ii + 1] - file_group[ii]
                X_cat_t= []; X_int_t = []
                for ig in range(group_size):
                    i = file_group[ii] + ig
                    X_int_t.append(np.zeros((den_fea, total_per_file[i])))
                    X_cat_t.append(np.zeros((spa_fea, total_per_file[i])))
                y = [0]*group_size
                start = [0]*group_size
                end = [0]*group_size

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r')
                    # load a group of files
                    for ig in range(group_size):
                        i = file_group[ii] + ig
                        # setup start and end ranges
                        start[ig] = offset_per_file[i]
                        end[ig] = offset_per_file[i + 1]
                        # load data for the group of files
                        if j < tar_fea:
                            y[ig] = fj[start[ig]:end[ig]]
                        elif tar_fea <= j and j < tad_fea:
                            X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]]
                        else:
                            X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]]

                    del fj

                for ig in range(group_size):
                    i = file_group[ii] + ig
                    filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                    print("Creating " + filename_i)

                    np.savez_compressed(
                        filename_i,
                        X_cat=np.transpose(X_cat_t[ig]),  # transpose of the data
                        X_int=np.transpose(X_int_t[ig]),  # transpose of the data
                        y=y[ig],
                    )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        """
        # Approach 3: group features
        # check if data already exists
        group_fea = 5  # e.g. 8, 5 or 4
        group_num = tot_fea // group_fea
        if tot_fea % group_fea != 0:  # sanity check
            sys.exit("ERROR: the group_fea must divided tot_fea evenly.")
        recreate_flag = False
        for jn in range(group_num):
            filename_j = trafile + "_{0}_reordered{1}.npy".format(
                jn, group_fea
            )
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((group_fea, total_count))
            for jn in range(group_num):
                filename_j = trafile + "_{0}_reordered{1}".format(
                    jn, group_fea
                )
                np.save(filename_j, z)
                print("Creating " + filename_j)

            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat_t = np.transpose(data["X_cat"])
                    X_int_t = np.transpose(data["X_int"])
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                # print(filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for jn in range(group_num):
                    filename_j = trafile + "_{0}_reordered{1}.npy".format(
                        jn, group_fea
                    )
                    fj = np.load(filename_j, mmap_mode='r+')
                    for jg in range(group_fea):
                        j = jn * group_fea + jg
                        # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
                        if j < tar_fea:
                            fj[jg, indices[start:end]] = y
                        elif tar_fea <= j and j < tad_fea:
                            fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :]
                        else:
                            fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing" + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                size = total_per_file[i]
                X_int_t = np.zeros((den_fea, size))
                X_cat_t = np.zeros((spa_fea, size))
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                print("Creating " + filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for jn in range(group_num):
                    filename_j = trafile + "_{0}_reordered{1}.npy".format(
                        jn, group_fea
                    )
                    fj = np.load(filename_j, mmap_mode='r')
                    for jg in range(group_fea):
                        j = jn * group_fea + jg
                        # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
                        if j < tar_fea:
                            y = fj[jg, start:end]
                        elif tar_fea <= j and j < tad_fea:
                            X_int_t[j - tar_fea, :] = fj[jg, start:end]
                        else:
                            X_cat_t[j - tad_fea, :] = fj[jg, start:end]
                    del fj

                np.savez_compressed(
                    filename_i,
                    X_cat=np.transpose(X_cat_t),  # transpose of the data
                    X_int=np.transpose(X_int_t),  # transpose of the data
                    y=y,
                )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm
        # 1st pass of FYR shuffle
        # check if data already exists
        recreate_flag = False
        for j in range(days):
            filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
            filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
            filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
            if (
                path.exists(filename_j_y)
                and path.exists(filename_j_d)
                and path.exists(filename_j_s)
            ):
                print(
                    "Using existing\n"
                    + filename_j_y
                    + "\n"
                    + filename_j_d
                    + "\n"
                    + filename_j_s
                )
            else:
                recreate_flag = True
        # reorder across buckets using sampling
        if recreate_flag:
            # init intermediate files (.npy appended automatically)
            # one target (y), dense (d) and sparse (s) buffer per day
            for j in range(days):
                filename_j_y = npzfile + "_{0}_intermediate_y".format(j)
                filename_j_d = npzfile + "_{0}_intermediate_d".format(j)
                filename_j_s = npzfile + "_{0}_intermediate_s".format(j)
                np.save(filename_j_y, np.zeros((total_per_file[j])))
                np.save(filename_j_d, np.zeros((total_per_file[j], den_fea)))
                np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea)))
            # start processing files
            # total_counter[j] tracks how far bucket j has been filled so far
            total_counter = [0] * days
            for i in range(days):
                filename_i = npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat = data["X_cat"]
                    X_int = data["X_int"]
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # debug prints
                print("Reordering (1st pass) " + filename_i)

                # create buckets using sampling of random ints
                # from (discrete) uniform distribution
                buckets = []
                for _j in range(days):
                    buckets.append([])
                counter = [0] * days
                days_to_sample = days if data_split == "none" else days - 1
                if randomize == "total":
                    rand_u = np.random.randint(low=0, high=days_to_sample, size=size)
                    for k in range(size):
                        # sample and make sure elements per buckets do not overflow
                        if data_split == "none" or i < days - 1:
                            # choose bucket
                            p = rand_u[k]
                            # retry if the chosen bucket is already full
                            while total_counter[p] + counter[p] >= total_per_file[p]:
                                p = np.random.randint(low=0, high=days_to_sample)
                        else:  # preserve the last day/bucket if needed
                            p = i
                        buckets[p].append(k)
                        counter[p] += 1
                else:  # randomize is day or none
                    for k in range(size):
                        # do not sample, preserve the data in this bucket
                        p = i
                        buckets[p].append(k)
                        counter[p] += 1

                # sanity check
                if np.sum(counter) != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # debug prints
                # print(counter)
                # print(str(np.sum(counter)) + " = " + str(size))
                # print([len(x) for x in buckets])
                # print(total_counter)

                # partially fill the buckets with this day's samples
                for j in range(days):
                    filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
                    filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
                    filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
                    start = total_counter[j]
                    end = total_counter[j] + counter[j]
                    # target buckets
                    fj_y = np.load(filename_j_y, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_y[start:end].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_y[start:end] = y[buckets[j]]
                    del fj_y
                    # dense buckets
                    fj_d = np.load(filename_j_d, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_d[start:end, :].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_d[start:end, :] = X_int[buckets[j], :]
                    del fj_d
                    # sparse buckets
                    fj_s = np.load(filename_j_s, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_s[start:end, :].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_s[start:end, :] = X_cat[buckets[j], :]
                    del fj_s
                    # update counters for next step
                    total_counter[j] += counter[j]

        # 2nd pass of FYR shuffle
        # check if data already exists
        # NOTE(review): recreate_flag is NOT reset here — if the 1st pass
        # recreated the intermediate files, the 2nd pass reruns even when
        # reordered files already exist; confirm this carry-over is intended
        for j in range(days):
            filename_j = npzfile + "_{0}_reordered.npz".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # reorder within buckets
        if recreate_flag:
            for j in range(days):
                filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
                filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
                filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
                fj_y = np.load(filename_j_y)
                fj_d = np.load(filename_j_d)
                fj_s = np.load(filename_j_s)

                # shuffle each (non-final, unless data_split is "none") bucket
                indices = range(total_per_file[j])
                if randomize == "day" or randomize == "total":
                    if data_split == "none" or j < days - 1:
                        indices = np.random.permutation(range(total_per_file[j]))

                filename_r = npzfile + "_{0}_reordered.npz".format(j)
                print("Reordering (2nd pass) " + filename_r)
                np.savez_compressed(
                    filename_r,
                    X_cat=fj_s[indices, :],
                    X_int=fj_d[indices, :],
                    y=fj_y[indices],
                )

        """
        # sanity check (under no reordering norms should be zero)
        for i in range(days):
            filename_i_o = npzfile + "_{0}_processed.npz".format(i)
            print(filename_i_o)
            with np.load(filename_i_o) as data_original:
                X_cat_o = data_original["X_cat"]
                X_int_o = data_original["X_int"]
                y_o = data_original["y"]
            filename_i_r = npzfile + "_{0}_reordered.npz".format(i)
            print(filename_i_r)
            with np.load(filename_i_r) as data_reordered:
                X_cat_r = data_reordered["X_cat"]
                X_int_r = data_reordered["X_int"]
                y_r = data_reordered["y"]
            print(np.linalg.norm(y_o - y_r))
            print(np.linalg.norm(X_int_o - X_int_r))
            print(np.linalg.norm(X_cat_o - X_cat_r))
        """
    else:
        print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename))

        # load and concatenate data from all days into single arrays
        for i in range(days):
            filename_i = npzfile + "_{0}_processed.npz".format(i)
            with np.load(filename_i) as data:
                if i == 0:
                    X_cat = data["X_cat"]
                    X_int = data["X_int"]
                    y = data["y"]
                else:
                    X_cat = np.concatenate((X_cat, data["X_cat"]))
                    X_int = np.concatenate((X_int, data["X_int"]))
                    y = np.concatenate((y, data["y"]))
            print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0]))

        # attach the pre-computed per-column category counts
        with np.load(d_path + d_file + "_fea_count.npz") as data:
            counts = data["counts"]
        print("Loaded counts!")

        np.savez_compressed(
            d_path + o_filename + ".npz",
            X_cat=X_cat,
            X_int=X_int,
            y=y,
            counts=counts,
        )

    return d_path + o_filename + ".npz"
def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file):
    """Apply a log transform to dense features and split/randomize the data.

    Transforms Criteo Kaggle or Terabyte data: categorical ids are cast to
    int, dense features become log(x + 1) in float32, labels become float32.

    Inputs:
        X_cat (ndarray): preprocessed categorical features (integer ids)
        X_int (ndarray): dense integer features
        y (ndarray): array of bool corresponding to labels
        days (int): number of day splits in the dataset
        data_split (str): "train" carves out training/validation/test sets
        randomize (str): randomization scheme
            "none": no randomization
            "day": randomizes each day's data (only works if split = True)
            "total": randomizes total dataset

    Outputs:
        9-tuple (X_cat_train, X_int_train, y_train,
                 X_cat_val, X_int_val, y_val,
                 X_cat_test, X_int_test, y_test).
        When data_split is not "train", the first three slots hold the full
        transformed arrays and the remaining six are empty lists.
    """
    # define initial set of indices
    indices = np.arange(len(y))
    # cumulative per-day offsets: offset_per_file[i] is day i's first index
    offset_per_file = np.array([0] + list(total_per_file))
    for i in range(days):
        offset_per_file[i + 1] += offset_per_file[i]

    if data_split == "train":
        indices = np.array_split(indices, offset_per_file[1:-1])

        if randomize == "day":  # or randomize == "total":
            # shuffle each training day independently
            for i in range(len(indices) - 1):
                indices[i] = np.random.permutation(indices[i])
            print("Randomized indices per day ...")

        train_indices = np.concatenate(indices[:-1])
        # the last day is halved into test and validation sets
        test_indices = indices[-1]
        test_indices, val_indices = np.array_split(test_indices, 2)
        print("Defined training and testing indices...")

        if randomize == "total":
            # shuffle the training set across day boundaries
            train_indices = np.random.permutation(train_indices)
            print("Randomized indices across days ...")

        # slice each set and transform: categorical -> int ids,
        # dense -> log(x + 1) float32, labels -> float32
        X_cat_train = X_cat[train_indices].astype(int)
        X_int_train = np.log(X_int[train_indices].astype(np.float32) + 1)
        y_train = y[train_indices].astype(np.float32)
        X_cat_val = X_cat[val_indices].astype(int)
        X_int_val = np.log(X_int[val_indices].astype(np.float32) + 1)
        y_val = y[val_indices].astype(np.float32)
        X_cat_test = X_cat[test_indices].astype(int)
        X_int_test = np.log(X_int[test_indices].astype(np.float32) + 1)
        y_test = y[test_indices].astype(np.float32)
        print("Split data according to indices...")
        print("Converted to tensors...done!")

        return (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        )

    # no split requested: transform (and optionally shuffle) everything
    if randomize == "total":
        indices = np.random.permutation(indices)
        print("Randomized indices...")

    X_cat = X_cat[indices].astype(int)
    X_int = np.log(X_int[indices].astype(np.float32) + 1)
    y = y[indices].astype(np.float32)
    print("Converted to tensors...done!")

    return (X_cat, X_int, y, [], [], [], [], [], [])
def getCriteoAdData(
    datafile,
    o_filename,
    max_ind_range=-1,
    sub_sample_rate=0.0,
    days=7,
    data_split="train",
    randomize="total",
    criteo_kaggle=True,
    memory_map=False,
    dataset_multiprocessing=False,
):
    """Run the full Criteo preprocessing pipeline over the raw data.

    Passes through the entire dataset, splits it into per-day files, defines
    dictionaries for the 26 categorical features, determines the total number
    of categories per feature, converts each day with processCriteoAdData,
    and finally concatenates/reorders via concatCriteoAdData.

    Inputs:
        datafile (str): path to downloaded raw data file
        o_filename (str): saves results under o_filename if filename is not ""
        max_ind_range (int): if > 0, hash categorical ids modulo this range
        sub_sample_rate (float): probability of dropping a zero-target sample
        days (int): number of day splits (7 for Kaggle, 24 for Terabyte)
        data_split (str): dataset splitting flag, forwarded downstream
        randomize (str): "none", "day" or "total" shuffling scheme
        criteo_kaggle (bool): True for Kaggle layout, False for Terabyte
        memory_map (bool): forwarded to concatCriteoAdData
        dataset_multiprocessing (bool): parse/process days in parallel
            processes (uses multiprocessing Manager/Process)

    Output:
        o_file (str): output file path
    """
    # split the datafile into path and filename
    lstr = datafile.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1]
    npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file)
    trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea")

    # count number of datapoints in training set
    total_file = d_path + d_file + "_day_count.npz"
    if path.exists(total_file):
        with np.load(total_file) as data:
            total_per_file = list(data["total_per_file"])
        total_count = np.sum(total_per_file)
        print("Skipping counts per file (already exist)")
    else:
        total_count = 0
        total_per_file = []
        if criteo_kaggle:
            # WARNING: The raw data consists of a single train.txt file
            # Each line in the file is a sample, consisting of 13 continuous and
            # 26 categorical features (an extra space indicates that feature is
            # missing and will be interpreted as 0).
            if path.exists(datafile):
                print("Reading data from path=%s" % (datafile))
                with open(str(datafile)) as f:
                    for _ in f:
                        total_count += 1
                total_per_file.append(total_count)
                # reset total per file due to split: distribute samples as
                # evenly as possible across `days` files (extras go first)
                num_data_per_split, extras = divmod(total_count, days)
                total_per_file = [num_data_per_split] * days
                for j in range(extras):
                    total_per_file[j] += 1
                # split into days (simplifies code later on)
                file_id = 0
                boundary = total_per_file[file_id]
                nf = open(npzfile + "_" + str(file_id), "w")
                with open(str(datafile)) as f:
                    for j, line in enumerate(f):
                        if j == boundary:
                            nf.close()
                            file_id += 1
                            nf = open(npzfile + "_" + str(file_id), "w")
                            boundary += total_per_file[file_id]
                        nf.write(line)
                nf.close()
            else:
                sys.exit(
                    "ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset"
                )
        else:
            # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files
            # Each line in the file is a sample, consisting of 13 continuous and
            # 26 categorical features (an extra space indicates that feature is
            # missing and will be interpreted as 0).
            for i in range(days):
                datafile_i = datafile + "_" + str(i)  # + ".gz"
                if path.exists(str(datafile_i)):
                    print("Reading data from path=%s" % (str(datafile_i)))
                    # file day_<number>
                    total_per_file_count = 0
                    with open(str(datafile_i)) as f:
                        for _ in f:
                            total_per_file_count += 1
                    total_per_file.append(total_per_file_count)
                    total_count += total_per_file_count
                else:
                    sys.exit(
                        "ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs"
                    )

    # process a file worth of data and reinitialize data
    # note that a file may contain a single or multiple splits
    def process_one_file(
        datfile,
        npzfile,
        split,
        num_data_in_split,
        dataset_multiprocessing,
        convertDictsDay=None,
        resultDay=None,
    ):
        """Parse one raw text split into "<npzfile>_<split>.npz".

        Reads tab-separated lines (target, 13 ints, 26 hex strings), fills
        missing fields with 0, optionally sub-samples zero targets, and
        records the set of observed categorical values. Closes over
        sub_sample_rate, max_ind_range and (when not multiprocessing)
        convertDicts from getCriteoAdData. Returns the number of samples
        kept, or writes it into resultDay[split] when multiprocessing.
        """
        if dataset_multiprocessing:
            # per-process uniques; merged into convertDicts by the parent
            convertDicts_day = [{} for _ in range(26)]

        with open(str(datfile)) as f:
            y = np.zeros(num_data_in_split, dtype="i4")  # 4 byte int
            X_int = np.zeros((num_data_in_split, 13), dtype="i4")  # 4 byte int
            X_cat = np.zeros((num_data_in_split, 26), dtype="i4")  # 4 byte int
            if sub_sample_rate == 0.0:
                rand_u = 1.0
            else:
                rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split)

            i = 0
            percent = 0
            for k, line in enumerate(f):
                # process a line (data point)
                line = line.split("\t")
                # set missing values to zero
                for j in range(len(line)):
                    if (line[j] == "") or (line[j] == "\n"):
                        line[j] = "0"
                # sub-sample data by dropping zero targets, if needed
                target = np.int32(line[0])
                if (
                    target == 0
                    and (rand_u if sub_sample_rate == 0.0 else rand_u[k])
                    < sub_sample_rate
                ):
                    continue

                y[i] = target
                X_int[i] = np.array(line[1:14], dtype=np.int32)
                # categorical fields are hex strings; optionally hashed
                # into [0, max_ind_range)
                if max_ind_range > 0:
                    X_cat[i] = np.array(
                        list(map(lambda x: int(x, 16) % max_ind_range, line[14:])),
                        dtype=np.int32,
                    )
                else:
                    X_cat[i] = np.array(
                        list(map(lambda x: int(x, 16), line[14:])), dtype=np.int32
                    )

                # count uniques
                if dataset_multiprocessing:
                    for j in range(26):
                        convertDicts_day[j][X_cat[i][j]] = 1
                    # debug prints
                    if float(i) / num_data_in_split * 100 > percent + 1:
                        percent = int(float(i) / num_data_in_split * 100)
                        print(
                            "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d"
                            % (
                                i,
                                num_data_in_split,
                                percent,
                                split,
                                target,
                                y[i],
                            ),
                            end="\n",
                        )
                else:
                    for j in range(26):
                        convertDicts[j][X_cat[i][j]] = 1
                    # debug prints
                    print(
                        "Load %d/%d Split: %d Label True: %d Stored: %d"
                        % (
                            i,
                            num_data_in_split,
                            split,
                            target,
                            y[i],
                        ),
                        end="\r",
                    )
                i += 1

            # store num_data_in_split samples or extras at the end of file
            # count uniques
            # X_cat_t = np.transpose(X_cat)
            # for j in range(26):
            #     for x in X_cat_t[j,:]:
            #         convertDicts[j][x] = 1
            # store parsed
            filename_s = npzfile + "_{0}.npz".format(split)
            if path.exists(filename_s):
                print("\nSkip existing " + filename_s)
            else:
                np.savez_compressed(
                    filename_s,
                    X_int=X_int[0:i, :],
                    # X_cat=X_cat[0:i, :],
                    X_cat_t=np.transpose(X_cat[0:i, :]),  # transpose of the data
                    y=y[0:i],
                )
                print("\nSaved " + npzfile + "_{0}.npz!".format(split))

        if dataset_multiprocessing:
            resultDay[split] = i
            convertDictsDay[split] = convertDicts_day
            return
        else:
            return i

    # create all splits (reuse existing files if possible)
    recreate_flag = False
    convertDicts = [{} for _ in range(26)]
    # WARNING: to get reproducible sub-sampling results you must reset the seed below
    # np.random.seed(123)
    # in this case there is a single split in each day
    for i in range(days):
        npzfile_i = npzfile + "_{0}.npz".format(i)
        npzfile_p = npzfile + "_{0}_processed.npz".format(i)
        if path.exists(npzfile_i):
            print("Skip existing " + npzfile_i)
        elif path.exists(npzfile_p):
            print("Skip existing " + npzfile_p)
        else:
            recreate_flag = True

    if recreate_flag:
        if dataset_multiprocessing:
            # parse all days in parallel; Manager dicts collect each
            # process's sample count and observed categorical values
            resultDay = Manager().dict()
            convertDictsDay = Manager().dict()
            processes = [
                Process(
                    target=process_one_file,
                    name="process_one_file:%i" % i,
                    args=(
                        npzfile + "_{0}".format(i),
                        npzfile,
                        i,
                        total_per_file[i],
                        dataset_multiprocessing,
                        convertDictsDay,
                        resultDay,
                    ),
                )
                for i in range(0, days)
            ]
            for process in processes:
                process.start()
            for process in processes:
                process.join()
            for day in range(days):
                total_per_file[day] = resultDay[day]
                print("Constructing convertDicts Split: {}".format(day))
                # merge per-day uniques into the global dictionaries
                convertDicts_tmp = convertDictsDay[day]
                for i in range(26):
                    for j in convertDicts_tmp[i]:
                        convertDicts[i][j] = 1
        else:
            for i in range(days):
                total_per_file[i] = process_one_file(
                    npzfile + "_{0}".format(i),
                    npzfile,
                    i,
                    total_per_file[i],
                    dataset_multiprocessing,
                )

    # report and save total into a file
    total_count = np.sum(total_per_file)
    if not path.exists(total_file):
        np.savez_compressed(total_file, total_per_file=total_per_file)
    print("Total number of samples:", total_count)
    print("Divided into days/splits:\n", total_per_file)

    # dictionary files
    counts = np.zeros(26, dtype=np.int32)
    if recreate_flag:
        # create dictionaries: assign consecutive ids in insertion order
        for j in range(26):
            for i, x in enumerate(convertDicts[j]):
                convertDicts[j][x] = i
            dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j)
            if not path.exists(dict_file_j):
                np.savez_compressed(
                    dict_file_j, unique=np.array(list(convertDicts[j]), dtype=np.int32)
                )
            counts[j] = len(convertDicts[j])
        # store (uniques and) counts
        count_file = d_path + d_file + "_fea_count.npz"
        if not path.exists(count_file):
            np.savez_compressed(count_file, counts=counts)
    else:
        # create dictionaries (from existing files)
        for j in range(26):
            with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data:
                unique = data["unique"]
            for i, x in enumerate(unique):
                convertDicts[j][x] = i
        # load (uniques and) counts
        with np.load(d_path + d_file + "_fea_count.npz") as data:
            counts = data["counts"]

    # process all splits
    if dataset_multiprocessing:
        processes = [
            Process(
                target=processCriteoAdData,
                name="processCriteoAdData:%i" % i,
                args=(
                    d_path,
                    d_file,
                    npzfile,
                    i,
                    convertDicts,
                    counts,
                ),
            )
            for i in range(0, days)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
    else:
        for i in range(days):
            processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts)

    o_file = concatCriteoAdData(
        d_path,
        d_file,
        npzfile,
        trafile,
        days,
        data_split,
        randomize,
        total_per_file,
        total_count,
        memory_map,
        o_filename,
    )

    return o_file
def loadDataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    randomize,
    data_split,
    raw_path="",
    pro_data="",
    memory_map=False,
):
    """Return (file, days) for the requested Criteo dataset.

    If a pre-processed file (or, with memory_map, the full set of per-day
    reordered files) already exists it is used directly; otherwise the raw
    data is preprocessed via getCriteoAdData first.

    Inputs:
        dataset (str): "kaggle" or "terabyte"
        max_ind_range (int): hash range for categorical ids (forwarded)
        sub_sample_rate (float): zero-target sub-sampling rate (forwarded)
        randomize (str): "none", "day" or "total" (forwarded)
        data_split (str): dataset splitting flag (forwarded)
        raw_path (str): path to the raw data file
        pro_data (str): path to an already pre-processed npz file
        memory_map (bool): expect per-day reordered files instead

    Outputs:
        file (str): path to the (pre)processed data
        days (int): number of day splits (7 for kaggle, 24 for terabyte)

    Raises:
        ValueError: if dataset is neither "kaggle" nor "terabyte"
    """
    if dataset == "kaggle":
        days, o_filename = 7, "kaggleAdDisplayChallenge_processed"
    elif dataset == "terabyte":
        days, o_filename = 24, "terabyte_processed"
    else:
        raise (ValueError("Data set option is not supported"))

    # split the datafile into path and filename
    lstr = raw_path.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    npzfile = (d_file + "_day") if dataset == "kaggle" else d_file
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")

    # check if pre-processed data is available
    # WARNING: when memory mapping is used we get a collection of files,
    # one reordered file per day, and all of them must exist
    if memory_map:
        data_ready = all(
            path.exists(str(d_path + npzfile + "_{0}_reordered.npz".format(i)))
            for i in range(days)
        )
    else:
        data_ready = path.exists(str(pro_data))

    # pre-process data if needed
    if data_ready:
        print("Reading pre-processed data=%s" % (str(pro_data)))
        file = str(pro_data)
    else:
        print("Reading raw data=%s" % (str(raw_path)))
        file = getCriteoAdData(
            raw_path,
            o_filename,
            max_ind_range,
            sub_sample_rate,
            days,
            data_split,
            randomize,
            dataset == "kaggle",
            memory_map,
        )

    return file, days
if __name__ == "__main__":
    ### import packages ###
    import argparse

    ### parse arguments ###
    # command-line options controlling the Criteo preprocessing run
    cli = argparse.ArgumentParser(description="Preprocess Criteo dataset")
    # model related parameters
    cli.add_argument("--max-ind-range", type=int, default=-1)
    cli.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
    cli.add_argument("--data-randomize", type=str, default="total")  # or day or none
    cli.add_argument("--memory-map", action="store_true", default=False)
    cli.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
    cli.add_argument("--raw-data-file", type=str, default="")
    cli.add_argument("--processed-data-file", type=str, default="")
    opts = cli.parse_args()

    # run the full preprocessing pipeline on the "train" split
    loadDataset(
        opts.data_set,
        opts.max_ind_range,
        opts.data_sub_sample_rate,
        opts.data_randomize,
        "train",
        opts.raw_data_file,
        opts.processed_data_file,
        opts.memory_map,
    )
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the dlrm benchmark
# The inputs and outputs are generated according to the following three option(s)
# 1) random distribution
# 2) synthetic distribution, based on unique accesses and distances between them
# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven
# Simulation of Cache Memory", IEEE AINAM'07
# 3) public data set
# i) Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# ii) Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
from __future__ import absolute_import, division, print_function, unicode_literals
import bisect
import collections
# others
# from os import path
import sys
import data_utils
# numpy
import numpy as np
# pytorch
import torch
from numpy import random as ra
from torch.utils.data import Dataset
# Kaggle Display Advertising Challenge Dataset
# dataset (str): name of dataset (Kaggle or Terabyte)
# randomize (str): determines randomization scheme
# 'none': no randomization
# 'day': randomizes each day's data (only works if split = True)
# 'total': randomizes total dataset
# split (bool) : to split into train, test, validation data-sets
class CriteoDatasetWMemoryMap(Dataset):
    """Criteo dataset that reads samples from per-day ``*_reordered.npz``
    files (memory-map style) instead of holding the whole dataset in memory.

    dataset (str): "kaggle" (7 days) or "terabyte" (24 days)
    max_ind_range (int): if > 0, sparse indices are returned modulo this value
    sub_sample_rate, randomize: accepted for interface compatibility; the
        visible code of this class never references them
    split (str): "none" | "train" | "test" | "val"
    raw_path (str): raw data file path; used to derive the day-file names
    pro_data (str): accepted for interface compatibility; not used here
    """

    def __init__(
        self,
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split="train",
        raw_path="",
        pro_data="",
    ):
        # dataset
        # tar_fea = 1 # single target
        den_fea = 13  # 13 dense features
        # spa_fea = 26 # 26 sparse features
        # tad_fea = tar_fea + den_fea
        # tot_fea = tad_fea + spa_fea
        if dataset == "kaggle":
            days = 7
        elif dataset == "terabyte":
            days = 24
        else:
            raise (ValueError("Data set option is not supported"))
        self.max_ind_range = max_ind_range
        # split the datafile into path and filename
        lstr = raw_path.split("/")
        self.d_path = "/".join(lstr[0:-1]) + "/"
        self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
        self.npzfile = self.d_path + (
            (self.d_file + "_day") if dataset == "kaggle" else self.d_file
        )
        self.trafile = self.d_path + (
            (self.d_file + "_fea") if dataset == "kaggle" else "fea"
        )
        # get a number of samples per day
        total_file = self.d_path + self.d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]
        # compute offsets per file: offset_per_file[i] becomes the global index
        # of the first sample of day i (cumulative sum with a leading 0)
        self.offset_per_file = np.array([0] + list(total_per_file))
        for i in range(days):
            self.offset_per_file[i + 1] += self.offset_per_file[i]
        # print(self.offset_per_file)
        # setup data
        self.split = split
        if split == "none" or split == "train":
            self.day = 0
            self.max_day_range = days if split == "none" else days - 1
        elif split == "test" or split == "val":
            # the last day is halved: first half test, second half val
            self.day = days - 1
            num_samples = self.offset_per_file[days] - self.offset_per_file[days - 1]
            self.test_size = int(np.ceil(num_samples / 2.0))
            self.val_size = num_samples - self.test_size
        else:
            sys.exit("ERROR: dataset split is neither none, nor train or test.")
        # load unique counts
        with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
            self.counts = data["counts"]
        self.m_den = den_fea  # X_int.shape[1]
        self.n_emb = len(self.counts)
        print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))
        # Load the test data
        # Only a single day is used for testing
        if self.split == "test" or self.split == "val":
            # only a single day is used for testing
            fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
            with np.load(fi) as data:
                self.X_int = data["X_int"]  # continuous feature
                self.X_cat = data["X_cat"]  # categorical feature
                self.y = data["y"]  # target

    def __getitem__(self, index):
        """Return (X_int, X_cat, y) for the given global sample index.

        NOTE(review): for "none"/"train" splits the next day file is swapped
        in only when ``index`` equals a day offset, and ``self.day_boundary``
        is first assigned there — this assumes samples are requested
        sequentially starting from index 0 (unshuffled loader); confirm the
        DataLoader uses shuffle=False.
        """
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        if self.split == "none" or self.split == "train":
            # check if need to swicth to next day and load data
            if index == self.offset_per_file[self.day]:
                # print("day_boundary switch", index)
                self.day_boundary = self.offset_per_file[self.day]
                fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
                # print('Loading file: ', fi)
                with np.load(fi) as data:
                    self.X_int = data["X_int"]  # continuous feature
                    self.X_cat = data["X_cat"]  # categorical feature
                    self.y = data["y"]  # target
                self.day = (self.day + 1) % self.max_day_range
            i = index - self.day_boundary
        elif self.split == "test" or self.split == "val":
            # only a single day is used for testing
            i = index + (0 if self.split == "test" else self.test_size)
        else:
            sys.exit("ERROR: dataset split is neither none, nor train or test.")
        if self.max_ind_range > 0:
            return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i]
        else:
            return self.X_int[i], self.X_cat[i], self.y[i]

    def _default_preprocess(self, X_int, X_cat, y):
        """Log-transform dense features and convert fields to torch tensors.

        NOTE(review): not called anywhere in the visible code.
        """
        X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1)
        if self.max_ind_range > 0:
            X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long)
        else:
            X_cat = torch.tensor(X_cat, dtype=torch.long)
        y = torch.tensor(y.astype(np.float32))
        return X_int, X_cat, y

    def __len__(self):
        """Number of samples visible under the configured split."""
        if self.split == "none":
            return self.offset_per_file[-1]
        elif self.split == "train":
            return self.offset_per_file[-2]
        elif self.split == "test":
            return self.test_size
        elif self.split == "val":
            return self.val_size
        else:
            sys.exit("ERROR: dataset split is neither none, nor train nor test.")
def collate_wrapper_criteo(list_of_tuples):
    """Collate (X_int, X_cat, y) sample tuples into batched DLRM inputs.

    Returns (X_int, lS_o, lS_i, T): log-transformed dense features, stacked
    per-feature offset tensors (0..batch-1 per feature), stacked per-feature
    index tensors, and the target column vector.
    """
    dense_col, sparse_col, target_col = zip(*list_of_tuples)
    X_int = torch.log(torch.tensor(dense_col, dtype=torch.float) + 1)
    X_cat = torch.tensor(sparse_col, dtype=torch.long)
    T = torch.tensor(target_col, dtype=torch.float32).view(-1, 1)
    batch_size = X_cat.shape[0]
    feature_cnt = X_cat.shape[1]
    # one index column per sparse feature; one lookup per sample -> offsets 0..n-1
    lS_i = torch.stack([X_cat[:, f] for f in range(feature_cnt)])
    lS_o = torch.stack(
        [torch.tensor(range(batch_size)) for _ in range(feature_cnt)]
    )
    return X_int, lS_o, lS_i, T
# Conversion from offset to length
def offset_to_length_convertor(lS_o, lS_i):
    """Convert per-feature offset tensors into per-feature length tensors."""

    def consecutive_diff(t):
        # length k = offset[k+1] - offset[k]
        return t[1:] - t[:-1]

    lengths = []
    for ind, S_o in enumerate(lS_o):
        # append the total index count so the final segment gets a length too
        extended = torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()
        lengths.append(consecutive_diff(extended))
    return torch.stack(lengths)
def unpack_batch(b, data_gen, data_set):
    """Unpack a batch tuple, appending unit sample weights shaped like the target."""
    X, lS_o, lS_i, T = b[0], b[1], b[2], b[3]
    return X, lS_o, lS_i, T, torch.ones(T.size())
def read_dataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    mini_batch_size,
    num_batches,
    randomize,
    split="train",
    raw_data="",
    processed_data="",
    memory_map=False,
    inference_only=False,
    test_mini_batch_size=1,
):
    """Load a Criteo dataset and expose it as mini-batches.

    Returns:
        When ``memory_map`` is True: (train_data, train_loader, test_data,
        test_loader) backed by ``CriteoDatasetWMemoryMap`` objects.
        Otherwise: a 12-tuple (nbatches, lX, lS_lengths, lS_indices, lT,
        nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t, ln_emb, m_den)
        of fully materialized train/test batch lists plus embedding sizes.

    NOTE(review): ``inference_only`` is accepted but never referenced in the
    body of this function.
    """
    # split the datafile into path and filename
    lstr = raw_data.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    file, days = data_utils.loadDataset(
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split,
        raw_data,
        processed_data,
        memory_map,
    )
    if memory_map:
        # WARNING: at this point the data has been reordered and shuffled across files
        # e.g. day_<number>_reordered.npz, what remains is simply to read and feed
        # the data from each file, going in the order of days file-by-file, to the
        # model during training.
        train_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "train",
            raw_data,
            processed_data,
        )
        test_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "test",
            raw_data,
            processed_data,
        )
        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=test_mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        return train_data, train_loader, test_data, test_loader
    else:
        # load and preprocess data
        with np.load(file) as data:
            X_int = data["X_int"]
            X_cat = data["X_cat"]
            y = data["y"]
            counts = data["counts"]
        # get a number of samples per day
        total_file = d_path + d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]
        # transform
        (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        ) = data_utils.transformCriteoAdData(
            X_cat, X_int, y, days, split, randomize, total_per_file
        )
        ln_emb = counts
        m_den = X_int_train.shape[1]
        n_emb = len(counts)
        print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

        # adjust parameters
        def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
            # Materialize the (dense, lengths, indices, target) batch lists for
            # one split; closes over mini_batch_size, num_batches and n_emb.
            if max_ind_range > 0:
                X_cat = X_cat % max_ind_range
            nsamples = len(y)
            data_size = nsamples
            # using floor is equivalent to dropping last mini-batch (drop_last = True)
            nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
            print(print_message)
            if num_batches != 0 and num_batches < nbatches:
                print(
                    "Limiting to %d batches of the total % d batches"
                    % (num_batches, nbatches)
                )
                nbatches = num_batches
            else:
                print("Total number of batches %d" % nbatches)
            # data main loop
            lX = []
            lS_lengths = []
            lS_indices = []
            lT = []
            for j in range(0, nbatches):
                # number of data points in a batch
                print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
                n = min(mini_batch_size, data_size - (j * mini_batch_size))
                # dense feature
                idx_start = j * mini_batch_size
                lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
                # Targets - outputs
                lT.append(
                    (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
                )
                # sparse feature (sparse indices)
                lS_emb_indices = []
                # for each embedding generate a list of n lookups,
                # where each lookup is composed of multiple sparse indices
                for size in range(n_emb):
                    lS_batch_indices = []
                    for _b in range(n):
                        # num of sparse indices to be used per embedding, e.g. for
                        # store lengths and indices
                        lS_batch_indices += (
                            (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32)
                        ).tolist()
                    lS_emb_indices.append(lS_batch_indices)
                lS_indices.append(lS_emb_indices)
                # Criteo Kaggle data it is 1 because data is categorical
                lS_lengths.append(
                    [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
                )
            print("\n")
            return nbatches, lX, lS_lengths, lS_indices, lT

        # adjust training data
        (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples(
            X_cat_train, X_int_train, y_train, max_ind_range, "Training data"
        )
        # adjust testing data
        (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples(
            X_cat_test, X_int_test, y_test, max_ind_range, "Testing data"
        )
    # end if memory_map
    return (
        nbatches,
        lX,
        lS_lengths,
        lS_indices,
        lT,
        nbatches_t,
        lX_t,
        lS_lengths_t,
        lS_indices_t,
        lT_t,
        ln_emb,
        m_den,
    )
def generate_random_data(
    m_den,
    ln_emb,
    data_size,
    num_batches,
    mini_batch_size,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    num_targets=1,
    round_targets=False,
    data_generation="random",
    trace_file="",
    enable_padding=False,
):
    """Generate batches of inputs (random or synthetic) and random click targets.

    Returns (nbatches, lX, lS_lengths, lS_indices, lT).
    """
    nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
    if num_batches != 0:
        # an explicit batch count overrides the data size
        nbatches = num_batches
        data_size = nbatches * mini_batch_size
    # print("Total number of batches %d" % nbatches)
    lT, lX, lS_lengths, lS_indices = [], [], [], []
    for batch in range(nbatches):
        # the last batch may hold fewer than mini_batch_size points
        n = min(mini_batch_size, data_size - (batch * mini_batch_size))
        # generate a batch of dense and sparse features
        if data_generation == "random":
            (Xt, lS_emb_lengths, lS_emb_indices) = generate_uniform_input_batch(
                m_den, ln_emb, n, num_indices_per_lookup, num_indices_per_lookup_fixed
            )
        elif data_generation == "synthetic":
            (Xt, lS_emb_lengths, lS_emb_indices) = generate_synthetic_input_batch(
                m_den,
                ln_emb,
                n,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                trace_file,
                enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + data_generation + " is not supported"
            )
        lX.append(Xt)
        lS_lengths.append(lS_emb_lengths)
        lS_indices.append(lS_emb_indices)
        # generate a batch of target (probability of a click)
        lT.append(generate_random_output_batch(n, num_targets, round_targets))
    return (nbatches, lX, lS_lengths, lS_indices, lT)
def generate_random_output_batch(n, num_targets=1, round_targets=False):
    """Draw an (n, num_targets) batch of click probabilities.

    With round_targets the probabilities are binarized to int32 0/1 labels.
    """
    probs = ra.rand(n, num_targets).astype(np.float32)
    if round_targets:
        return np.round(probs).astype(np.int32)
    return probs
# uniform distribution (input data)
def generate_uniform_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
):
    """Draw one batch of uniform dense features plus per-table sparse lookups.

    Returns (Xt, lS_emb_lengths, lS_emb_indices): an (n, m_den) float32 array
    and, per embedding table, the lookup lengths and flattened indices for
    the n samples.
    """
    # dense features, uniform in [0, 1)
    Xt = ra.rand(n, m_den).astype(np.float32)
    lS_emb_lengths = []
    lS_emb_indices = []
    # one (lengths, indices) pair per embedding table
    for size in ln_emb:
        batch_lengths = []
        batch_indices = []
        for _ in range(n):
            if num_indices_per_lookup_fixed:
                group_size = np.int32(num_indices_per_lookup)
            else:
                # random group size in [1, num_indices_per_lookup]
                draw = ra.random(1)
                group_size = np.int32(
                    max(1, np.round(draw * min(size, num_indices_per_lookup))[0])
                )
            # draw the indices; np.unique collapses duplicates
            draw = ra.random(group_size)
            group = np.unique(np.round(draw * (size - 1)).astype(np.int32))
            # group size may have shrunk after duplicate removal
            batch_lengths.append(np.int32(group.size))
            batch_indices.extend(group.tolist())
        lS_emb_lengths.append(batch_lengths)
        lS_emb_indices.append(batch_indices)
    return (Xt, lS_emb_lengths, lS_emb_indices)
# synthetic distribution (input data)
def generate_synthetic_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    trace_file,
    enable_padding=False,
):
    """Generate one batch of dense features and trace-driven sparse lookups.

    Sparse indices are produced by replaying a synthetic memory trace built
    from a per-table distribution file (``trace_file`` with "j" replaced by
    the table index).

    NOTE(review): the distribution file is re-read for every sample of every
    table (innermost loop). Since trace_generate_lru mutates line_accesses,
    re-reading resets that state per sample — confirm this is intended, as it
    costs one file read per (table, sample) pair.
    """
    # dense feature
    Xt = ra.rand(n, m_den).astype(np.float32)
    # sparse feature (sparse indices)
    lS_emb_lengths = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for i, size in enumerate(ln_emb):
        lS_batch_lengths = []
        lS_batch_indices = []
        for _ in range(n):
            # num of sparse indices to be used per embedding (between
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int32(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int32(
                    max(1, np.round(r * min(size, num_indices_per_lookup))[0])
                )
            # sparse indices to be used per embedding
            file_path = trace_file
            line_accesses, list_sd, cumm_sd = read_dist_from_file(
                file_path.replace("j", str(i))
            )
            # debug print
            # print('input')
            # print(line_accesses); print(list_sd); print(cumm_sd);
            # print(sparse_group_size)
            # approach 1: rand
            # r = trace_generate_rand(
            #     line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            # )
            # approach 2: lru
            r = trace_generate_lru(
                line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            )
            # WARNING: if the distribution in the file is not consistent with
            # embedding table dimensions, below mod guards against out of
            # range access
            sparse_group = np.unique(r).astype(np.int32)
            minsg = np.min(sparse_group)
            maxsg = np.max(sparse_group)
            if (minsg < 0) or (size <= maxsg):
                print(
                    "WARNING: distribution is inconsistent with embedding "
                    + "table size (using mod to recover and continue)"
                )
                sparse_group = np.mod(sparse_group, size).astype(np.int32)
            # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int32))
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int32(sparse_group.size)
            # store lengths and indices
            lS_batch_lengths += [sparse_group_size]
            lS_batch_indices += sparse_group.tolist()
        lS_emb_lengths.append(lS_batch_lengths)
        lS_emb_indices.append(lS_batch_indices)
    return (Xt, lS_emb_lengths, lS_emb_indices)
def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
    """Sample a stack distance from a cumulative distribution.

    Args:
        cumm_val: sorted support values of the distribution.
        cumm_dist: cumulative probabilities aligned with cumm_val.
        max_i: total number of unique references available.
        i: number of new references seen so far.
        enable_padding: once all references have been seen, remap the
            distribution support so that no new (sd == 0) references
            are generated.

    Returns:
        One element of cumm_val.
    """
    u = ra.rand(1)
    if i < max_i:
        # only generate stack distances up to the number of new references seen so far
        j = bisect.bisect(cumm_val, i) - 1
        fi = cumm_dist[j]
        u *= fi  # shrink distribution support to exclude last values
    elif enable_padding:
        # WARNING: disable generation of new references (once all have been seen)
        fi = cumm_dist[0]
        u = (1.0 - fi) * u + fi  # remap distribution support to exclude first value
    for j, f in enumerate(cumm_dist):
        if u <= f:
            return cumm_val[j]
    # Fallback: floating-point rounding can leave cumm_dist[-1] slightly below
    # u, in which case the loop above finds no bucket; previously execution
    # fell off the end and implicitly returned None, crashing the caller.
    return cumm_val[-1]
# WARNING: global define, must be consistent across all synthetic functions
cache_line_size = 1  # elements per cache line when mapping line refs to memory refs
def trace_generate_lru(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Generate a synthetic memory trace of out_trace_len references.

    Maintains ``line_accesses`` as an LRU stack (mutated in place): a sampled
    stack distance of 0 introduces the least-recently-used line as a "new"
    reference; otherwise the line at that stack distance is reused and moved
    to the most-recently-used end.
    """
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)
    new_refs_seen = 0
    ztrace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, new_refs_seen, enable_padding
        )
        mem_ref_within_line = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:  # new reference #
            # rotate the LRU line to the MRU end
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            new_refs_seen += 1
        else:  # existing reference #
            # reuse the line at stack distance sd and move it to the MRU end
            line_ref = line_accesses.pop(num_lines - sd)
            line_accesses.append(line_ref)
        # save generated memory reference
        ztrace.append(np.uint64(line_ref * cache_line_size + mem_ref_within_line))
    return ztrace
def trace_generate_rand(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Generate a synthetic memory trace of out_trace_len references.

    Unlike the LRU variant, a reused line keeps its position in
    ``line_accesses``; only new (sd == 0) references rotate the list.
    """
    max_sd = list_sd[-1]
    num_unique = len(line_accesses)  # !!!Unique,
    new_refs_seen = 0
    ztrace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, new_refs_seen, enable_padding
        )
        mem_ref_within_line = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:  # new reference #
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            new_refs_seen += 1
        else:  # existing reference #
            line_ref = line_accesses[num_unique - sd]
        ztrace.append(np.uint64(line_ref * cache_line_size + mem_ref_within_line))
    return ztrace
def trace_profile(trace, enable_padding=False):
    """Profile a memory trace into its LRU stack-distance histogram inputs.

    Based on the stack-distance profiling of the synthetic-trace paper cited
    in the file header.

    Returns:
        (rstack, stack_distances, line_accesses): the final LRU stack, one
        stack distance per reference (0 for first-time accesses), and the
        first-touch order of unique lines. The two lists are built with
        insert(0, ...), i.e. in reverse trace order; callers reverse them.
    """
    # number of elements in the array (assuming 1D)
    # n = trace.size
    rstack = []  # S
    stack_distances = []  # SDS
    line_accesses = []  # L
    for x in trace:
        r = np.uint64(x / cache_line_size)
        l = len(rstack)
        try:  # found #
            i = rstack.index(r)
            # WARNING: I believe below is the correct depth in terms of meaning of the
            # algorithm, but that is not what seems to be in the paper alg.
            # -1 can be subtracted if we defined the distance between
            # consecutive accesses (e.g. r, r) as 0 rather than 1.
            sd = l - i  # - 1
            # push r to the end of stack_distances
            stack_distances.insert(0, sd)
            # remove r from its position and insert to the top of stack
            rstack.pop(i)  # rstack.remove(r)
            rstack.insert(l - 1, r)
        except ValueError:  # not found #
            sd = 0  # -1
            # push r to the end of stack_distances/line_accesses
            stack_distances.insert(0, sd)
            line_accesses.insert(0, r)
            # push r to the top of stack
            rstack.insert(l, r)
    if enable_padding:
        # WARNING: notice that as the ratio between the number of samples (l)
        # and cardinality [c] of a sample increases the probability of
        # generating a sample gets smaller and smaller because there are
        # few new samples compared to repeated samples. This means that for a
        # long trace with relatively small cardinality it will take longer to
        # generate all new samples and therefore obtain full distribution support
        # and hence it takes longer for distribution to resemble the original.
        # Therefore, we may pad the number of new samples to be on par with
        # average number of samples l/c artificially.
        # NOTE(review): if the trace has no reuse, max(stack_distances) is 0
        # and the division below raises — confirm padding is only enabled for
        # traces with repeated accesses.
        l = len(stack_distances)
        c = max(stack_distances)
        padding = int(np.ceil(l / c))
        stack_distances = stack_distances + [0] * padding
    return (rstack, stack_distances, line_accesses)
# auxiliary read/write routines
def read_trace_from_file(file_path, binary_type=None):
    """Read a memory trace (list of np.uint64) from a file.

    Args:
        file_path: path of the trace file.
        binary_type: True to read a raw binary uint64 dump, False to parse a
            single comma-separated text line. Defaults to None, which falls
            back to the script-level ``args.trace_file_binary_type`` flag so
            existing single-argument callers keep working.

    Returns:
        The trace as a list of np.uint64, or None when the file cannot be
        read (the error is printed, matching the original best-effort style).
    """
    try:
        if binary_type is None:
            # legacy behavior: consult the CLI flag; if this module was
            # imported as a library (no `args`), the broad except below
            # swallows the NameError exactly as the original code did
            binary_type = args.trace_file_binary_type
        with open(file_path) as f:
            if binary_type:
                array = np.fromfile(f, dtype=np.uint64)
                trace = array.astype(np.uint64).tolist()
            else:
                line = f.readline()
                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
            return trace
    except Exception:
        print("ERROR: no input trace file has been provided")
def write_trace_to_file(file_path, trace, binary_type=None):
    """Write a memory trace to a file.

    Args:
        file_path: destination path.
        trace: sequence of integer memory references.
        binary_type: True to write a raw binary uint64 dump, False to write
            one comma-separated text line. Defaults to None, which falls back
            to the script-level ``args.trace_file_binary_type`` flag so
            existing two-argument callers keep working.

    Errors are printed rather than raised (original best-effort style).
    """
    try:
        if binary_type is None:
            # legacy behavior: consult the CLI flag; a missing `args` (library
            # use) is swallowed by the broad except, as in the original code
            binary_type = args.trace_file_binary_type
        if binary_type:
            with open(file_path, "wb+") as f:
                np.array(trace).astype(np.uint64).tofile(f)
        else:
            with open(file_path, "w+") as f:
                # str([a, b, c]) -> "a, b, c" once the brackets are stripped
                s = str(trace)
                f.write(s[1 : len(s) - 1])
    except Exception:
        print("ERROR: no output trace file has been provided")
def read_dist_from_file(file_path):
    """Read a (unique_accesses, list_sd, cumm_sd) distribution from a file.

    The file layout is three comma-separated lines: unique line accesses
    (int), stack-distance support values (int), and cumulative probabilities
    (float) — the format produced by write_dist_to_file.

    Raises:
        The underlying I/O error when the file cannot be read.
    """
    try:
        with open(file_path, "r") as f:
            lines = f.read().splitlines()
    except Exception:
        print("Wrong file or file path")
        # Previously execution fell through and crashed on the undefined
        # `lines` variable; re-raise so the caller sees the real I/O error.
        raise
    # read unique accesses
    unique_accesses = [int(el) for el in lines[0].split(", ")]
    # read cumulative distribution (elements are passed as two separate lists)
    list_sd = [int(el) for el in lines[1].split(", ")]
    cumm_sd = [float(el) for el in lines[2].split(", ")]
    return unique_accesses, list_sd, cumm_sd
def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
    """Write the three distribution lists to a file, one CSV line each.

    Line order: unique accesses, stack-distance values, cumulative
    probabilities (the format read back by read_dist_from_file). Errors are
    printed rather than raised.
    """
    try:
        with open(file_path, "w") as f:
            for seq in (unique_accesses, list_sd, cumm_sd):
                # str([a, b]) -> "a, b" once the surrounding brackets are stripped
                s = str(seq)
                f.write(s[1 : len(s) - 1] + "\n")
    except Exception:
        print("Wrong file or file path")
if __name__ == "__main__":
    import argparse
    import operator
    import sys

    ### parse arguments ###
    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
    parser.add_argument("--trace-enable-padding", type=bool, default=False)
    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
    parser.add_argument(
        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
    )
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--print-precision", type=int, default=5)
    args = parser.parse_args()
    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    ### read trace ###
    trace = read_trace_from_file(args.trace_file)
    # print(trace)
    ### profile trace ###
    (_, stack_distances, line_accesses) = trace_profile(
        trace, args.trace_enable_padding
    )
    # trace_profile builds its lists in reverse trace order; restore order
    stack_distances.reverse()
    line_accesses.reverse()
    # print(line_accesses)
    # print(stack_distances)
    ### compute probability distribution ###
    # count items
    l = len(stack_distances)
    dc = sorted(
        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
    )
    # create a distribution
    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
    dist_sd = list(
        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
    )  # k = tuple_x_k[1]
    cumm_sd = []  # np.cumsum(dc).tolist() #prefixsum
    for i, (_, k) in enumerate(dc):
        if i == 0:
            cumm_sd.append(k / float(l))
        else:
            # add the 2nd element of the i-th tuple in the dist_sd list
            cumm_sd.append(cumm_sd[i - 1] + (k / float(l)))
    ### write stack_distance and line_accesses to a file ###
    write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd)
    ### generate corresponding synthetic trace ###
    # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file)
    synthetic_trace = trace_generate_lru(
        line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    )
    # synthetic_trace = trace_generate_rand(
    #     line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    # )
    write_trace_to_file(args.synthetic_file, synthetic_trace)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the dlrm benchmark
# The inputs and outputs are generated according to the following three option(s)
# 1) random distribution
# 2) synthetic distribution, based on unique accesses and distances between them
# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven
# Simulation of Cache Memory", IEEE AINAM'07
# 3) public data set
# i) Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# ii) Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
from __future__ import absolute_import, division, print_function, unicode_literals
import bisect
import collections
import sys
from collections import deque
# others
from os import path
import data_loader_terabyte
import data_utils
import mlperf_logger
# numpy
import numpy as np
# pytorch
import torch
from numpy import random as ra
from torch.utils.data import Dataset, RandomSampler
# Kaggle Display Advertising Challenge Dataset
# dataset (str): name of dataset (Kaggle or Terabyte)
# randomize (str): determines randomization scheme
# "none": no randomization
# "day": randomizes each day"s data (only works if split = True)
# "total": randomizes total dataset
# split (bool) : to split into train, test, validation data-sets
class CriteoDataset(Dataset):
def __init__(
self,
dataset,
max_ind_range,
sub_sample_rate,
randomize,
split="train",
raw_path="",
pro_data="",
memory_map=False,
dataset_multiprocessing=False,
):
# dataset
# tar_fea = 1 # single target
den_fea = 13 # 13 dense features
# spa_fea = 26 # 26 sparse features
# tad_fea = tar_fea + den_fea
# tot_fea = tad_fea + spa_fea
if dataset == "kaggle":
days = 7
out_file = "kaggleAdDisplayChallenge_processed"
elif dataset == "terabyte":
days = 24
out_file = "terabyte_processed"
else:
raise (ValueError("Data set option is not supported"))
self.max_ind_range = max_ind_range
self.memory_map = memory_map
# split the datafile into path and filename
lstr = raw_path.split("/")
self.d_path = "/".join(lstr[0:-1]) + "/"
self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
self.npzfile = self.d_path + (
(self.d_file + "_day") if dataset == "kaggle" else self.d_file
)
self.trafile = self.d_path + (
(self.d_file + "_fea") if dataset == "kaggle" else "fea"
)
# check if pre-processed data is available
data_ready = True
if memory_map:
for i in range(days):
reo_data = self.npzfile + "_{0}_reordered.npz".format(i)
if not path.exists(str(reo_data)):
data_ready = False
else:
if not path.exists(str(pro_data)):
data_ready = False
# pre-process data if needed
# WARNNING: when memory mapping is used we get a collection of files
if data_ready:
print("Reading pre-processed data=%s" % (str(pro_data)))
file = str(pro_data)
else:
print("Reading raw data=%s" % (str(raw_path)))
file = data_utils.getCriteoAdData(
raw_path,
out_file,
max_ind_range,
sub_sample_rate,
days,
split,
randomize,
dataset == "kaggle",
memory_map,
dataset_multiprocessing,
)
# get a number of samples per day
total_file = self.d_path + self.d_file + "_day_count.npz"
with np.load(total_file) as data:
total_per_file = data["total_per_file"]
# compute offsets per file
self.offset_per_file = np.array([0] + [x for x in total_per_file])
for i in range(days):
self.offset_per_file[i + 1] += self.offset_per_file[i]
# print(self.offset_per_file)
# setup data
if memory_map:
# setup the training/testing split
self.split = split
if split == "none" or split == "train":
self.day = 0
self.max_day_range = days if split == "none" else days - 1
elif split == "test" or split == "val":
self.day = days - 1
num_samples = (
self.offset_per_file[days] - self.offset_per_file[days - 1]
)
self.test_size = int(np.ceil(num_samples / 2.0))
self.val_size = num_samples - self.test_size
else:
sys.exit("ERROR: dataset split is neither none, nor train or test.")
"""
# text
print("text")
for i in range(days):
fi = self.npzfile + "_{0}".format(i)
with open(fi) as data:
ttt = 0; nnn = 0
for _j, line in enumerate(data):
ttt +=1
if np.int32(line[0]) > 0:
nnn +=1
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
# processed
print("processed")
for i in range(days):
fi = self.npzfile + "_{0}_processed.npz".format(i)
with np.load(fi) as data:
yyy = data["y"]
ttt = len(yyy)
nnn = np.count_nonzero(yyy)
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
# reordered
print("reordered")
for i in range(days):
fi = self.npzfile + "_{0}_reordered.npz".format(i)
with np.load(fi) as data:
yyy = data["y"]
ttt = len(yyy)
nnn = np.count_nonzero(yyy)
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
"""
# load unique counts
with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
self.counts = data["counts"]
self.m_den = den_fea # X_int.shape[1]
self.n_emb = len(self.counts)
print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))
# Load the test data
# Only a single day is used for testing
if self.split == "test" or self.split == "val":
# only a single day is used for testing
fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
with np.load(fi) as data:
self.X_int = data["X_int"] # continuous feature
self.X_cat = data["X_cat"] # categorical feature
self.y = data["y"] # target
else:
# load and preprocess data
with np.load(file) as data:
X_int = data["X_int"] # continuous feature
X_cat = data["X_cat"] # categorical feature
y = data["y"] # target
self.counts = data["counts"]
self.m_den = X_int.shape[1] # den_fea
self.n_emb = len(self.counts)
print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den))
# create reordering
indices = np.arange(len(y))
if split == "none":
# randomize all data
if randomize == "total":
indices = np.random.permutation(indices)
print("Randomized indices...")
X_int[indices] = X_int
X_cat[indices] = X_cat
y[indices] = y
else:
indices = np.array_split(indices, self.offset_per_file[1:-1])
# randomize train data (per day)
if randomize == "day": # or randomize == "total":
for i in range(len(indices) - 1):
indices[i] = np.random.permutation(indices[i])
print("Randomized indices per day ...")
train_indices = np.concatenate(indices[:-1])
test_indices = indices[-1]
test_indices, val_indices = np.array_split(test_indices, 2)
print("Defined %s indices..." % (split))
# randomize train data (across days)
if randomize == "total":
train_indices = np.random.permutation(train_indices)
print("Randomized indices across days ...")
# create training, validation, and test sets
if split == "train":
self.X_int = [X_int[i] for i in train_indices]
self.X_cat = [X_cat[i] for i in train_indices]
self.y = [y[i] for i in train_indices]
elif split == "val":
self.X_int = [X_int[i] for i in val_indices]
self.X_cat = [X_cat[i] for i in val_indices]
self.y = [y[i] for i in val_indices]
elif split == "test":
self.X_int = [X_int[i] for i in test_indices]
self.X_cat = [X_cat[i] for i in test_indices]
self.y = [y[i] for i in test_indices]
print("Split data according to indices...")
    def __getitem__(self, index):
        """Return one sample (X_int, X_cat, y) at `index`.

        Slices are supported by recursing per element. In memory_map mode
        the per-day .npz shards are loaded lazily, and `index` is translated
        into an offset within the currently loaded day/split.
        """
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        if self.memory_map:
            if self.split == "none" or self.split == "train":
                # check if we need to switch to the next day and load its data
                if index == self.offset_per_file[self.day]:
                    # print("day_boundary switch", index)
                    self.day_boundary = self.offset_per_file[self.day]
                    fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
                    # print('Loading file: ', fi)
                    with np.load(fi) as data:
                        self.X_int = data["X_int"]  # continuous feature
                        self.X_cat = data["X_cat"]  # categorical feature
                        self.y = data["y"]  # target
                    self.day = (self.day + 1) % self.max_day_range
                # index relative to the start of the loaded day
                i = index - self.day_boundary
            elif self.split == "test" or self.split == "val":
                # only a single day is used for testing; the loaded day holds
                # the test samples first, so "val" indices are shifted by
                # test_size (consistent with __len__ using val_size/test_size)
                i = index + (0 if self.split == "test" else self.test_size)
            else:
                sys.exit("ERROR: dataset split is neither none, nor train or test.")
        else:
            i = index
        if self.max_ind_range > 0:
            # fold categorical ids into [0, max_ind_range)
            return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i]
        else:
            return self.X_int[i], self.X_cat[i], self.y[i]
def _default_preprocess(self, X_int, X_cat, y):
X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1)
if self.max_ind_range > 0:
X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long)
else:
X_cat = torch.tensor(X_cat, dtype=torch.long)
y = torch.tensor(y.astype(np.float32))
return X_int, X_cat, y
def __len__(self):
if self.memory_map:
if self.split == "none":
return self.offset_per_file[-1]
elif self.split == "train":
return self.offset_per_file[-2]
elif self.split == "test":
return self.test_size
elif self.split == "val":
return self.val_size
else:
sys.exit("ERROR: dataset split is neither none, nor train nor test.")
else:
return len(self.y)
def collate_wrapper_criteo_offset(list_of_tuples):
    """Collate (X_int, X_cat, y) samples into one DLRM batch (offset format).

    Returns (X, lS_o, lS_i, T): log-transformed dense features, per-feature
    offset tensors (one offset per sample), per-feature index tensors, and
    targets as a column vector.
    """
    dense_raw, sparse_raw, target_raw = zip(*list_of_tuples)
    dense = torch.log(torch.tensor(dense_raw, dtype=torch.float) + 1)
    sparse = torch.tensor(sparse_raw, dtype=torch.long)
    targets = torch.tensor(target_raw, dtype=torch.float32).view(-1, 1)
    num_rows, num_features = sparse.shape
    # one index column per sparse feature; one lookup per row, so offsets
    # are simply 0..num_rows-1 for every feature
    per_feature_indices = torch.stack([sparse[:, f] for f in range(num_features)])
    per_feature_offsets = torch.stack(
        [torch.tensor(range(num_rows)) for _ in range(num_features)]
    )
    return dense, per_feature_offsets, per_feature_indices, targets
def ensure_dataset_preprocessed(args, d_path):
    """Materialize the binary (.bin) Criteo Terabyte dataset files.

    First constructs CriteoDataset for "train" and "test" — construction is
    what triggers (and caches) the raw-data preprocessing into the reordered
    per-day .npz files — then converts those .npz files to one .bin file per
    split via data_loader_terabyte.numpy_to_binary.

    Improvements over the original: the two duplicated CriteoDataset calls are
    folded into a loop, and the loop-invariant input-file lists are computed
    once instead of on every split iteration.
    """
    for split in ("train", "test"):
        _ = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            split,
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )
    # days 0..22 form the training set; day 23 is shared by val and test
    train_files = [
        "{}_{}_reordered.npz".format(args.raw_data_file, day)
        for day in range(0, 23)
    ]
    test_valid_file = args.raw_data_file + "_23_reordered.npz"
    for split in ["train", "val", "test"]:
        print("Running preprocessing for split =", split)
        output_file = d_path + "_{}.bin".format(split)
        input_files = train_files if split == "train" else [test_valid_file]
        data_loader_terabyte.numpy_to_binary(
            input_files=input_files, output_file_path=output_file, split=split
        )
# Conversion from offset to length
def offset_to_length_converter(lS_o, lS_i):
    """Convert per-table offset tensors into per-lookup length tensors.

    For each table, the lookup lengths are the successive differences of
    the offsets, closed off by the total number of indices in that table.
    """
    lengths = []
    for table_idx, offsets in enumerate(lS_o):
        total = torch.tensor(lS_i[table_idx].shape)
        boundaries = torch.cat((offsets, total)).int()
        lengths.append(boundaries[1:] - boundaries[:-1])
    return torch.stack(lengths)
def collate_wrapper_criteo_length(list_of_tuples):
    """Collate (X_int, X_cat, y) samples into one DLRM batch (length format).

    Same as collate_wrapper_criteo_offset, except the second element of the
    returned tuple holds per-lookup lengths instead of offsets.
    """
    dense_raw, sparse_raw, target_raw = zip(*list_of_tuples)
    X_int = torch.log(torch.tensor(dense_raw, dtype=torch.float) + 1)
    X_cat = torch.tensor(sparse_raw, dtype=torch.long)
    T = torch.tensor(target_raw, dtype=torch.float32).view(-1, 1)
    num_rows, num_features = X_cat.shape
    lS_i = torch.stack([X_cat[:, f] for f in range(num_features)])
    lS_o = torch.stack(
        [torch.tensor(range(num_rows)) for _ in range(num_features)]
    )
    lS_l = offset_to_length_converter(lS_o, lS_i)
    return X_int, lS_l, lS_i, T
def make_criteo_data_and_loaders(args, offset_to_length_converter=False):
    """Build Criteo train/test datasets and DataLoaders.

    Three paths, selected from args:
      1. mlperf_logging + memory_map + terabyte + mlperf_bin_loader:
         binary-file datasets (running preprocessing first if the .bin
         files are missing),
      2. mlperf_logging + memory_map + terabyte: day-sharded numpy loaders,
      3. otherwise: in-memory CriteoDataset with a standard DataLoader.

    Returns (train_data, train_loader, test_data, test_loader).

    Improvement over the original: the CriteoDataset construction, which was
    duplicated verbatim for both splits in two branches, is extracted into a
    local helper.
    """

    def _criteo_split(split):
        # build the CriteoDataset for one split (all other args identical)
        return CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            split,
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

    if args.mlperf_logging and args.memory_map and args.data_set == "terabyte":
        # more efficient for larger batches
        data_directory = path.dirname(args.raw_data_file)

        if args.mlperf_bin_loader:
            lstr = args.processed_data_file.split("/")
            d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0]
            train_file = d_path + "_train.bin"
            test_file = d_path + "_test.bin"
            # val_file = d_path + "_val.bin"
            counts_file = args.raw_data_file + "_fea_count.npz"
            if any(not path.exists(p) for p in [train_file, test_file, counts_file]):
                ensure_dataset_preprocessed(args, d_path)

            train_data = data_loader_terabyte.CriteoBinDataset(
                data_file=train_file,
                counts_file=counts_file,
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
            )
            mlperf_logger.log_event(
                key=mlperf_logger.constants.TRAIN_SAMPLES, value=train_data.num_samples
            )
            # batch_size=None: the dataset already yields whole batches
            train_loader = torch.utils.data.DataLoader(
                train_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
                sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None,
            )

            test_data = data_loader_terabyte.CriteoBinDataset(
                data_file=test_file,
                counts_file=counts_file,
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
            )
            mlperf_logger.log_event(
                key=mlperf_logger.constants.EVAL_SAMPLES, value=test_data.num_samples
            )
            test_loader = torch.utils.data.DataLoader(
                test_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
            )
        else:
            data_filename = args.raw_data_file.split("/")[-1]
            train_data = _criteo_split("train")
            test_data = _criteo_split("test")
            # terabyte layout: days 0..22 train, day 23 test
            train_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=list(range(23)),
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="train",
            )
            test_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=[23],
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="test",
            )
    else:
        train_data = _criteo_split("train")
        test_data = _criteo_split("test")

        collate_wrapper_criteo = collate_wrapper_criteo_offset
        if offset_to_length_converter:
            collate_wrapper_criteo = collate_wrapper_criteo_length

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.mini_batch_size,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=args.test_mini_batch_size,
            shuffle=False,
            num_workers=args.test_num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

    return train_data, train_loader, test_data, test_loader
# uniform ditribution (input data)
class RandomDataset(Dataset):
    """Synthetic dataset that fabricates one whole DLRM mini-batch per index.

    Each __getitem__ returns (X, lS_o, lS_i, T): dense features, per-table
    lookup offsets, per-table lookup indices, and click targets, generated
    either from a random distribution (generate_dist_input_batch) or from a
    synthetic trace file (generate_synthetic_input_batch).
    """
    def __init__(
        self,
        m_den,
        ln_emb,
        data_size,
        num_batches,
        mini_batch_size,
        num_indices_per_lookup,
        num_indices_per_lookup_fixed,
        num_targets=1,
        round_targets=False,
        data_generation="random",
        trace_file="",
        enable_padding=False,
        reset_seed_on_access=False,
        rand_data_dist="uniform",
        rand_data_min=1,
        rand_data_max=1,
        rand_data_mu=-1,
        rand_data_sigma=1,
        rand_seed=0,
    ):
        """Record generation parameters; no data is materialized up front."""
        # compute batch size
        nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
        if num_batches != 0:
            # explicit num_batches overrides data_size
            nbatches = num_batches
            data_size = nbatches * mini_batch_size
        # print("Total number of batches %d" % nbatches)
        # save args (recompute data_size if needed)
        self.m_den = m_den
        self.ln_emb = ln_emb
        self.data_size = data_size
        self.num_batches = nbatches
        self.mini_batch_size = mini_batch_size
        self.num_indices_per_lookup = num_indices_per_lookup
        self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed
        self.num_targets = num_targets
        self.round_targets = round_targets
        self.data_generation = data_generation
        self.trace_file = trace_file
        self.enable_padding = enable_padding
        self.reset_seed_on_access = reset_seed_on_access
        self.rand_seed = rand_seed
        self.rand_data_dist = rand_data_dist
        self.rand_data_min = rand_data_min
        self.rand_data_max = rand_data_max
        self.rand_data_mu = rand_data_mu
        self.rand_data_sigma = rand_data_sigma
    def reset_numpy_seed(self, numpy_rand_seed):
        """Reseed numpy's global RNG so sample streams can be replayed."""
        np.random.seed(numpy_rand_seed)
        # torch.manual_seed(numpy_rand_seed)
    def __getitem__(self, index):
        """Generate and return the index-th batch (X, lS_o, lS_i, T)."""
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        # WARNING: reset seed on access to first element
        # (e.g. if same random samples needed across epochs)
        if self.reset_seed_on_access and index == 0:
            self.reset_numpy_seed(self.rand_seed)
        # number of data points in a batch (last batch may be short)
        n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size))
        # generate a batch of dense and sparse features
        if self.data_generation == "random":
            (X, lS_o, lS_i) = generate_dist_input_batch(
                self.m_den,
                self.ln_emb,
                n,
                self.num_indices_per_lookup,
                self.num_indices_per_lookup_fixed,
                rand_data_dist=self.rand_data_dist,
                rand_data_min=self.rand_data_min,
                rand_data_max=self.rand_data_max,
                rand_data_mu=self.rand_data_mu,
                rand_data_sigma=self.rand_data_sigma,
            )
        elif self.data_generation == "synthetic":
            (X, lS_o, lS_i) = generate_synthetic_input_batch(
                self.m_den,
                self.ln_emb,
                n,
                self.num_indices_per_lookup,
                self.num_indices_per_lookup_fixed,
                self.trace_file,
                self.enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + self.data_generation + " is not supported"
            )
        # generate a batch of target (probability of a click)
        T = generate_random_output_batch(n, self.num_targets, self.round_targets)
        return (X, lS_o, lS_i, T)
    def __len__(self):
        # WARNING: note that we produce batches of outputs in __getitem__
        # therefore we should use num_batches rather than data_size below
        return self.num_batches
def collate_wrapper_random_offset(list_of_tuples):
    """Unwrap the single pre-built batch (X, lS_o, lS_i, T) produced by
    RandomDataset and stack its per-table offset tensors."""
    dense, offsets, indices, targets = list_of_tuples[0]
    return (dense, torch.stack(offsets), indices, targets)
def collate_wrapper_random_length(list_of_tuples):
    """Unwrap the single pre-built batch (X, lS_o, lS_i, T), converting the
    stacked per-table offsets into per-lookup lengths."""
    dense, offsets, indices, targets = list_of_tuples[0]
    lengths = offset_to_length_converter(torch.stack(offsets), indices)
    return (dense, lengths, indices, targets)
def make_random_data_and_loader(
    args,
    ln_emb,
    m_den,
    offset_to_length_converter=False,
):
    """Build identical random train/test RandomDatasets plus DataLoaders.

    Returns (train_data, train_loader, test_data, test_loader). batch_size
    is 1 because RandomDataset already yields whole mini-batches.
    """

    def _build_dataset():
        # WARNING: generates a batch of lookups at once
        return RandomDataset(
            m_den,
            ln_emb,
            args.data_size,
            args.num_batches,
            args.mini_batch_size,
            args.num_indices_per_lookup,
            args.num_indices_per_lookup_fixed,
            1,  # num_targets
            args.round_targets,
            args.data_generation,
            args.data_trace_file,
            args.data_trace_enable_padding,
            reset_seed_on_access=True,
            rand_data_dist=args.rand_data_dist,
            rand_data_min=args.rand_data_min,
            rand_data_max=args.rand_data_max,
            rand_data_mu=args.rand_data_mu,
            rand_data_sigma=args.rand_data_sigma,
            rand_seed=args.numpy_rand_seed,
        )

    train_data = _build_dataset()
    test_data = _build_dataset()

    collate_wrapper_random = (
        collate_wrapper_random_length
        if offset_to_length_converter
        else collate_wrapper_random_offset
    )

    def _build_loader(dataset):
        # one dataset item == one mini-batch, hence batch_size=1
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_random,
            pin_memory=False,
            drop_last=False,  # True
        )

    train_loader = _build_loader(train_data)
    test_loader = _build_loader(test_data)
    return train_data, train_loader, test_data, test_loader
def generate_random_data(
    m_den,
    ln_emb,
    data_size,
    num_batches,
    mini_batch_size,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    num_targets=1,
    round_targets=False,
    data_generation="random",
    trace_file="",
    enable_padding=False,
    length=False,  # length for caffe2 version (except dlrm_s_caffe2)
):
    """Eagerly pre-generate all input and target batches.

    Returns (nbatches, lX, lS_offsets, lS_indices, lT) — one entry per batch
    in each list.
    """
    nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
    if num_batches != 0:
        # explicit num_batches overrides data_size
        nbatches = num_batches
        data_size = nbatches * mini_batch_size
    # print("Total number of batches %d" % nbatches)
    lT = []
    lX = []
    lS_offsets = []
    lS_indices = []
    for batch_idx in range(0, nbatches):
        # number of data points in this batch (last one may be short)
        samples_in_batch = min(mini_batch_size, data_size - (batch_idx * mini_batch_size))
        # generate a batch of dense and sparse features
        if data_generation == "random":
            (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch(
                m_den,
                ln_emb,
                samples_in_batch,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                length,
            )
        elif data_generation == "synthetic":
            (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch(
                m_den,
                ln_emb,
                samples_in_batch,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                trace_file,
                enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + data_generation + " is not supported"
            )
        # dense feature
        lX.append(Xt)
        # sparse feature (sparse indices)
        lS_offsets.append(lS_emb_offsets)
        lS_indices.append(lS_emb_indices)
        # matching batch of targets (probability of a click)
        lT.append(
            generate_random_output_batch(samples_in_batch, num_targets, round_targets)
        )
    return (nbatches, lX, lS_offsets, lS_indices, lT)
def generate_random_output_batch(n, num_targets, round_targets=False):
    """Draw an (n, num_targets) tensor of click probabilities in [0, 1);
    optionally round them into hard 0/1 labels."""
    clicks = ra.rand(n, num_targets).astype(np.float32)
    if round_targets:
        clicks = np.round(clicks).astype(np.float32)
    return torch.tensor(clicks)
# uniform ditribution (input data)
def generate_uniform_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    length,
):
    """Generate one batch of uniform-random dense and sparse features.

    For each table in ln_emb, n lookups are generated; each lookup draws a
    group of uniform indices in [0, size), deduplicated via np.unique.
    With length=True (caffe2 variant) the per-lookup group sizes are stored
    instead of running offsets.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).
    """
    # dense feature
    dense_x = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    emb_offsets = []
    emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for table_size in ln_emb:
        batch_offsets = []
        batch_indices = []
        running_offset = 0
        for _ in range(n):
            # number of sparse indices for this lookup
            if num_indices_per_lookup_fixed:
                group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                draw = ra.random(1)
                group_size = np.int64(
                    np.round(max([1.0], draw * min(table_size, num_indices_per_lookup)))
                )
            # sparse indices to be used per embedding
            draw = ra.random(group_size)
            group = np.unique(np.round(draw * (table_size - 1)).astype(np.int64))
            # dedup may have shrunk the group
            group_size = np.int32(group.size)
            if length:  # caffe2 variant stores lengths instead of offsets
                batch_offsets += [group_size]
            else:
                batch_offsets += [running_offset]
            batch_indices += group.tolist()
            running_offset += group_size
        emb_offsets.append(torch.tensor(batch_offsets))
        emb_indices.append(torch.tensor(batch_indices))
    return (dense_x, emb_offsets, emb_indices)
# random data from uniform or gaussian ditribution (input data)
def generate_dist_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    rand_data_dist,
    rand_data_min,
    rand_data_max,
    rand_data_mu,
    rand_data_sigma,
):
    """Generate one batch of dense features and sparse lookups whose indices
    come from a "uniform" or "gaussian" distribution.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).

    Raises:
        ValueError: if rand_data_dist is neither "uniform" nor "gaussian".
            (The original `raise (tuple, "msg")` was invalid Python 3 — a
            raised object must derive from BaseException.)
    """
    # dense feature
    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    lS_emb_offsets = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for size in ln_emb:
        lS_batch_offsets = []
        lS_batch_indices = []
        offset = 0
        for _ in range(n):
            # number of sparse indices for this lookup
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int64(
                    np.round(max([1.0], r * min(size, num_indices_per_lookup)))
                )
            # sparse indices to be used per embedding
            if rand_data_dist == "gaussian":
                if rand_data_mu == -1:
                    # default the mean to the middle of the allowed range
                    rand_data_mu = (rand_data_max + rand_data_min) / 2.0
                r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size)
                sparse_group = np.clip(r, rand_data_min, rand_data_max)
                # NOTE(review): unique runs on floats before the int cast, so
                # casting can reintroduce duplicate indices — confirm intent.
                sparse_group = np.unique(sparse_group).astype(np.int64)
            elif rand_data_dist == "uniform":
                r = ra.random(sparse_group_size)
                sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))
            else:
                raise ValueError(
                    rand_data_dist
                    + " distribution is not supported. "
                    + "please select uniform or gaussian"
                )
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int64(sparse_group.size)
            # store lengths and indices
            lS_batch_offsets += [offset]
            lS_batch_indices += sparse_group.tolist()
            # update offset for next iteration
            offset += sparse_group_size
        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
        lS_emb_indices.append(torch.tensor(lS_batch_indices))
    return (Xt, lS_emb_offsets, lS_emb_indices)
# synthetic distribution (input data)
def generate_synthetic_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    trace_file,
    enable_padding=False,
):
    """Generate a batch whose sparse lookups follow a synthetic trace model.

    For embedding table i, a stack-distance distribution is loaded from
    trace_file with "j" replaced by str(i), and lookup indices are produced
    via trace_generate_lru. Dense features remain uniform random.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).
    """
    # dense feature
    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    lS_emb_offsets = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for i, size in enumerate(ln_emb):
        lS_batch_offsets = []
        lS_batch_indices = []
        offset = 0
        for _ in range(n):
            # num of sparse indices to be used per embedding (between
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int64(
                    max(1, np.round(r * min(size, num_indices_per_lookup))[0])
                )
            # sparse indices to be used per embedding
            # NOTE(review): the distribution file is re-read for every sample;
            # hoisting the read out of this loop would alter the generated
            # sequence because trace_generate_lru mutates line_accesses —
            # confirm before optimizing.
            file_path = trace_file
            line_accesses, list_sd, cumm_sd = read_dist_from_file(
                file_path.replace("j", str(i))
            )
            # debug prints
            # print("input")
            # print(line_accesses); print(list_sd); print(cumm_sd);
            # print(sparse_group_size)
            # approach 1: rand
            # r = trace_generate_rand(
            #     line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            # )
            # approach 2: lru
            r = trace_generate_lru(
                line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            )
            # WARNING: if the distribution in the file is not consistent
            # with embedding table dimensions, below mod guards against out
            # of range access
            sparse_group = np.unique(r).astype(np.int64)
            minsg = np.min(sparse_group)
            maxsg = np.max(sparse_group)
            if (minsg < 0) or (size <= maxsg):
                print(
                    "WARNING: distribution is inconsistent with embedding "
                    + "table size (using mod to recover and continue)"
                )
                sparse_group = np.mod(sparse_group, size).astype(np.int64)
            # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64))
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int64(sparse_group.size)
            # store lengths and indices
            lS_batch_offsets += [offset]
            lS_batch_indices += sparse_group.tolist()
            # update offset for next iteration
            offset += sparse_group_size
        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
        lS_emb_indices.append(torch.tensor(lS_batch_indices))
    return (Xt, lS_emb_offsets, lS_emb_indices)
def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
    """Sample a stack distance from the empirical CDF (cumm_val, cumm_dist).

    While fewer than max_i unique references have been seen (i < max_i), the
    distribution's support is shrunk so only distances reachable with i
    unique references can be drawn. Once everything has been seen and
    enable_padding is set, the first bucket (a "new reference") is excluded
    instead.
    """
    u = ra.rand(1)
    if i < max_i:
        # only generate stack distances up to the number of new references seen so far
        bucket = bisect.bisect(cumm_val, i) - 1
        u *= cumm_dist[bucket]  # shrink support to exclude the tail values
    elif enable_padding:
        # WARNING: disable generation of new references (once all have been seen)
        first_mass = cumm_dist[0]
        u = first_mass + (1.0 - first_mass) * u  # remap support past first value
    for value, mass in zip(cumm_val, cumm_dist):
        if u <= mass:
            return value
# WARNING: global define, must be consistent across all synthetic functions
# Scale factor between a cache-line id and a memory reference; 1 means each
# line maps to exactly one reference.
cache_line_size = 1
def trace_generate_lru(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Synthesize a memory-reference trace of length out_trace_len from the
    stack-distance distribution, using an LRU stack model.

    Mutates line_accesses in place: every touched line is moved to the end
    (most-recently-used position). Returns the trace as a deque of np.uint64.
    """
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)
    unique_seen = 0
    trace = deque()
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, unique_seen, enable_padding
        )
        line_offset = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:
            # new reference: consume the next never-seen line from the front
            line_ref = line_accesses[0]
            del line_accesses[0]
            line_accesses.append(line_ref)
            unique_seen += 1
        else:
            # existing reference: re-touch the line at stack depth sd and
            # promote it to most-recently-used
            line_ref = line_accesses[num_lines - sd]
            del line_accesses[num_lines - sd]
            line_accesses.append(line_ref)
        trace.append(np.uint64(line_ref * cache_line_size + line_offset))
    return trace
def trace_generate_rand(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Like trace_generate_lru, but existing references do not update recency:
    only brand-new references rotate line_accesses. Returns a list of
    np.uint64 references."""
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)  # !!!Unique,
    unique_seen = 0
    trace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, unique_seen, enable_padding
        )
        line_offset = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:
            # new reference: rotate the front (never-seen) line to the back
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            unique_seen += 1
        else:
            # existing reference at stack depth sd (recency left unchanged)
            line_ref = line_accesses[num_lines - sd]
        trace.append(np.uint64(line_ref * cache_line_size + line_offset))
    return trace
def trace_profile(trace, enable_padding=False):
    """Profile a memory-reference trace into LRU stack distances.

    Returns (rstack, stack_distances, line_accesses): the final LRU stack,
    the stack distance of every access, and the first-touch order of unique
    lines. stack_distances and line_accesses are built with appendleft, so
    they come out in reverse chronological order (callers reverse() them).

    Bug fixes vs. the original:
      * `stack_distances + [0] * padding` raised TypeError (deque + list is
        not supported) whenever enable_padding was True — replaced with an
        in-place extend.
      * padding also divided by max(stack_distances), which is 0 when every
        access is unique — now guarded.
    """
    # number of elements in the array (assuming 1D)
    # n = trace.size
    rstack = deque()  # S: LRU stack of lines (right end = most recent)
    stack_distances = deque()  # SDS
    line_accesses = deque()  # L: unique lines in first-touch order
    for x in trace:
        r = np.uint64(x / cache_line_size)
        l = len(rstack)
        try:  # found #
            i = rstack.index(r)
            # WARNING: I believe below is the correct depth in terms of meaning of the
            # algorithm, but that is not what seems to be in the paper alg.
            # -1 can be subtracted if we defined the distance between
            # consecutive accesses (e.g. r, r) as 0 rather than 1.
            sd = l - i  # - 1
            # push r to the end of stack_distances
            stack_distances.appendleft(sd)
            # remove r from its position and insert to the top of stack
            del rstack[i]  # rstack.remove(r)
            rstack.append(r)
        except ValueError:  # not found #
            sd = 0  # -1
            # push r to the end of stack_distances/line_accesses
            stack_distances.appendleft(sd)
            line_accesses.appendleft(r)
            # push r to the top of stack
            rstack.append(r)
    if enable_padding:
        # WARNING: notice that as the ratio between the number of samples (l)
        # and cardinality [c] of a sample increases the probability of
        # generating a sample gets smaller and smaller because there are
        # few new samples compared to repeated samples. This means that for a
        # long trace with relatively small cardinality it will take longer to
        # generate all new samples and therefore obtain full distribution support
        # and hence it takes longer for distribution to resemble the original.
        # Therefore, we may pad the number of new samples to be on par with
        # average number of samples l/c artificially.
        l = len(stack_distances)
        c = max(stack_distances)
        if c > 0:  # all-unique traces have nothing to pad against
            padding = int(np.ceil(l / c))
            # extend in place: deque does not support `deque + list`
            stack_distances.extend([0] * padding)
    return (rstack, stack_distances, line_accesses)
# auxiliary read/write routines
def read_trace_from_file(file_path):
    """Read a memory-reference trace from file_path.

    Depends on the module-level ``args`` namespace (set under __main__):
    args.trace_file_binary_type selects raw uint64 (np.fromfile) vs text
    ("v1, v2, ..." on a single line) parsing.

    Returns a list of np.uint64, or None (implicit) if reading fails.
    """
    try:
        with open(file_path) as f:
            # NOTE(review): the binary branch reads through a text-mode
            # handle; np.fromfile works on POSIX but may garble data on
            # Windows — confirm whether "rb" was intended.
            if args.trace_file_binary_type:
                array = np.fromfile(f, dtype=np.uint64)
                trace = array.astype(np.uint64).tolist()
            else:
                line = f.readline()
                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
            return trace
    except Exception:
        print(f"ERROR: trace file '{file_path}' is not available.")
def write_trace_to_file(file_path, trace):
    """Persist a trace, either as raw uint64 bytes or as comma-separated
    text, depending on the module-level args.trace_file_binary_type flag."""
    try:
        if args.trace_file_binary_type:
            with open(file_path, "wb+") as f:
                np.array(trace).astype(np.uint64).tofile(f)
        else:
            with open(file_path, "w+") as f:
                rendered = str(list(trace))
                # drop the surrounding "[" and "]"
                f.write(rendered[1:-1])
    except Exception:
        print("ERROR: no output trace file has been provided")
def read_dist_from_file(file_path):
    """Load a stack-distance distribution written by write_dist_to_file.

    The file holds three comma-separated lines: unique line accesses,
    stack-distance values, and the matching cumulative probabilities.

    Returns (unique_accesses, list_sd, cumm_sd).

    Raises the original exception if the file cannot be read. (Previously
    the failure was only printed — with a message missing its f-prefix —
    and execution fell through to a confusing NameError on `lines`.)
    """
    try:
        with open(file_path, "r") as f:
            lines = f.read().splitlines()
    except Exception:
        print(f"{file_path} Wrong file or file path")
        raise
    # read unique accesses
    unique_accesses = [int(el) for el in lines[0].split(", ")]
    # read cumulative distribution (elements are passed as two separate lists)
    list_sd = [int(el) for el in lines[1].split(", ")]
    cumm_sd = [float(el) for el in lines[2].split(", ")]
    return unique_accesses, list_sd, cumm_sd
def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
    """Write the three distribution components, one comma-separated line
    each: unique accesses, stack-distance values, cumulative probabilities."""

    def _strip_brackets(rendered):
        # "[1, 2, 3]" -> "1, 2, 3\n"
        return rendered[1 : len(rendered) - 1] + "\n"

    try:
        with open(file_path, "w") as f:
            f.write(_strip_brackets(str(list(unique_accesses))))
            f.write(_strip_brackets(str(list_sd)))
            f.write(_strip_brackets(str(list(cumm_sd))))
    except Exception:
        print("Wrong file or file path")
if __name__ == "__main__":
    # Standalone mode: profile an input trace into a stack-distance
    # distribution, write the distribution, then regenerate a synthetic
    # trace from it via the LRU model.
    import argparse
    import operator
    ### parse arguments ###
    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
    # NOTE(review): argparse type=bool treats any non-empty string as True
    # (e.g. "--trace-file-binary-type False" still yields True) — confirm.
    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
    parser.add_argument("--trace-enable-padding", type=bool, default=False)
    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
    parser.add_argument(
        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
    )
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--print-precision", type=int, default=5)
    args = parser.parse_args()
    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    ### read trace ###
    trace = read_trace_from_file(args.trace_file)
    # print(trace)
    ### profile trace ###
    (_, stack_distances, line_accesses) = trace_profile(
        trace, args.trace_enable_padding
    )
    # trace_profile emits via appendleft, so reverse back to chronological order
    stack_distances.reverse()
    line_accesses.reverse()
    # print(line_accesses)
    # print(stack_distances)
    ### compute probability distribution ###
    # count items
    l = len(stack_distances)
    dc = sorted(
        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
    )
    # create a distribution
    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
    dist_sd = list(
        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
    )  # k = tuple_x_k[1]
    cumm_sd = deque()  # np.cumsum(dc).tolist() #prefixsum
    for i, (_, k) in enumerate(dc):
        if i == 0:
            cumm_sd.append(k / float(l))
        else:
            # add the 2nd element of the i-th tuple in the dist_sd list
            cumm_sd.append(cumm_sd[i - 1] + (k / float(l)))
    ### write stack_distance and line_accesses to a file ###
    write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd)
    ### generate corresponding synthetic ###
    # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file)
    synthetic_trace = trace_generate_lru(
        line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    )
    # synthetic_trace = trace_generate_rand(
    #     line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    # )
    write_trace_to_file(args.synthetic_file, synthetic_trace)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import functools
# others
import operator
import time
# onnx
# The onnx import causes deprecation warnings every time workers
# are spawned during testing. So, we filter out those warnings.
import warnings
# data generation
import dlrm_data_pytorch as dp
# numpy
import numpy as np
import sklearn.metrics
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
try:
import caffe2.python.onnx.frontend
import onnx
except ImportError as error:
print("Unable to import onnx or caffe2.python.onnx.frontend ", error)
# from caffe2.python import data_parallel_model
# caffe2
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace
"""
# auxiliary routine used to split input on the mini-batch dimension
def where_to_split(mini_batch_size, ndevices, _add_leftover=False):
    n = (mini_batch_size + ndevices - 1) // ndevices  # ceiling
    l = mini_batch_size - n * (ndevices - 1)  # leftover
    ls = [n] * (ndevices - 1)
    if _add_leftover:
        ls += [l if l > 0 else n]
    return ls
"""
### define dlrm in Caffe2 ###
class DLRM_Net(object):
    def FeedBlobWrapper(self, tag, val, add_prefix=True, split=False, device_id=-1):
        """Feed numpy array `val` into the caffe2 workspace as blob `tag`.

        With ndevices > 1 and add_prefix, the blob is fed under a
        "gpu_<d>/" prefix on every device: split=True shards the
        mini-batch evenly across devices, otherwise the full tensor is
        replicated. With device_id >= 0 a single-device feed is pinned to
        that GPU; otherwise it goes to the default device.
        """
        if self.ndevices > 1 and add_prefix:
            if split:
                # split across devices
                mini_batch_size = val.shape[0]
                # approach 1: np and caffe2 operators assume the mini-batch size is
                # divisible exactly by the number of available devices
                if mini_batch_size % self.ndevices != 0:
                    sys.exit(
                        "ERROR: caffe2 net assumes that the mini_batch_size "
                        + str(mini_batch_size)
                        + " is evenly divisible by the number of available devices"
                        + str(self.ndevices)
                    )
                vals = np.split(val, self.ndevices, axis=0)
                """
                # approach 2: np and caffe2 operators do not assume exact divisibility
                if args.mini_batch_size != mini_batch_size:
                    sys.exit("ERROR: caffe2 net was prepared for mini-batch size "
                        + str(args.mini_batch_size)
                        + " which is different from current mini-batch size "
                        + str(mini_batch_size) + " being passed to it. "
                        + "This is common for the last mini-batch, when "
                        + "mini-batch size does not evenly divided the number of "
                        + "elements in the data set.")
                ls = where_to_split(mini_batch_size, self.ndevices)
                vals = np.split(val, ls, axis=0)
                """
                # feed one shard to each device
                for d in range(self.ndevices):
                    tag_on_device = "gpu_" + str(d) + "/" + tag
                    _d = core.DeviceOption(workspace.GpuDeviceType, d)
                    workspace.FeedBlob(tag_on_device, vals[d], device_option=_d)
            else:
                # feed the full tensor to every device (replication)
                for d in range(self.ndevices):
                    tag_on_device = "gpu_" + str(d) + "/" + tag
                    _d = core.DeviceOption(workspace.GpuDeviceType, d)
                    workspace.FeedBlob(tag_on_device, val, device_option=_d)
        else:
            # feed to a single device (named or not)
            if device_id >= 0:
                _d = core.DeviceOption(workspace.GpuDeviceType, device_id)
                workspace.FeedBlob(tag, val, device_option=_d)
            else:
                workspace.FeedBlob(tag, val)
def FetchBlobWrapper(self, tag, add_prefix=True, reduce_across=None, device_id=-1):
if self.ndevices > 1 and add_prefix:
# fetch from multiple devices
vals = []
for d in range(self.ndevices):
if tag.__class__ == list:
tag_on_device = tag[d]
else:
tag_on_device = "gpu_" + str(0) + "/" + tag
val = workspace.FetchBlob(tag_on_device)
vals.append(val)
# reduce across devices
if reduce_across == "add":
return functools.reduce(operator.add, vals)
elif reduce_across == "concat":
return np.concatenate(vals)
else:
return vals
else:
# fetch from a single device (named or not)
if device_id >= 0:
tag_on_device = "gpu_" + str(device_id) + "/" + tag
return workspace.FetchBlob(tag_on_device)
else:
return workspace.FetchBlob(tag)
    def AddLayerWrapper(
        self, layer, inp_blobs, out_blobs, add_prefix=True, reset_grad=False, **kwargs
    ):
        """Add operator `layer` to the model, replicated per device if needed.

        In multi-device mode the layer is emitted once per device with
        "gpu_<d>/"-prefixed blob tags and a list of per-device layers is
        returned; otherwise the single new layer is returned. `reset_grad`
        substitutes the gradient blob of inp_blobs[0] into the "" placeholder
        at inp_blobs[2] (used only for the WeightedSum update op).
        """
        # auxiliary routine to adjust tags: prefix string or list-of-string
        # blob names with the device scope, pass anything else through as-is
        def adjust_tag(blobs, on_device):
            if blobs.__class__ == str:
                _blobs = on_device + blobs
            elif blobs.__class__ == list:
                _blobs = list(map(lambda tag: on_device + tag, blobs))
            else:  # blobs.__class__ == model_helper.ModelHelper or something else
                _blobs = blobs
            return _blobs

        if self.ndevices > 1 and add_prefix:
            # add layer on multiple devices
            ll = []
            for d in range(self.ndevices):
                # add prefix on_device
                on_device = "gpu_" + str(d) + "/"
                _inp_blobs = adjust_tag(inp_blobs, on_device)
                _out_blobs = adjust_tag(out_blobs, on_device)
                # WARNING: reset_grad option was exlusively designed for WeightedSum
                # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced
                if reset_grad:
                    w_grad = self.gradientMap[_inp_blobs[0]]
                    _inp_blobs[2] = w_grad
                # add layer to the model
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    if kwargs:
                        new_layer = layer(_inp_blobs, _out_blobs, **kwargs)
                    else:
                        new_layer = layer(_inp_blobs, _out_blobs)
                ll.append(new_layer)
            return ll
        else:
            # add layer on a single device
            # WARNING: reset_grad option was exlusively designed for WeightedSum
            # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced
            if reset_grad:
                w_grad = self.gradientMap[inp_blobs[0]]
                inp_blobs[2] = w_grad
            # add layer to the model
            if kwargs:
                new_layer = layer(inp_blobs, out_blobs, **kwargs)
            else:
                new_layer = layer(inp_blobs, out_blobs)
            return new_layer
    def create_mlp(self, ln, sigmoid_layer, model, tag):
        """Build an MLP whose layer sizes are given by the array `ln`.

        Layer `sigmoid_layer` (1-based index into ln) gets a Sigmoid
        activation; all others get ReLU. Weights are initialized host-side
        with a Xavier-like normal fill and fed into the workspace. Returns
        (layers, weights): the per-layer activation ops and the flat list of
        weight/bias blob tags ([fc1_w, fc1_b, fc2_w, ...]).
        """
        (tag_layer, tag_in, tag_out) = tag
        # build MLP layer by layer
        layers = []
        weights = []
        for i in range(1, ln.size):
            n = ln[i - 1]
            m = ln[i]
            # create tags
            tag_fc_w = tag_layer + ":::" + "fc" + str(i) + "_w"
            tag_fc_b = tag_layer + ":::" + "fc" + str(i) + "_b"
            tag_fc_y = tag_layer + ":::" + "fc" + str(i) + "_y"
            tag_fc_z = tag_layer + ":::" + "fc" + str(i) + "_z"
            if i == ln.size - 1:
                tag_fc_z = tag_out
            weights.append(tag_fc_w)
            weights.append(tag_fc_b)
            # initialize the weights
            # approach 1: custom Xavier input, output or two-sided fill
            mean = 0.0  # std_dev = np.sqrt(variance)
            std_dev = np.sqrt(2 / (m + n))  # np.sqrt(1 / m) # np.sqrt(1 / n)
            W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
            std_dev = np.sqrt(1 / m)  # np.sqrt(2 / (m + 1))
            b = np.random.normal(mean, std_dev, size=m).astype(np.float32)
            self.FeedBlobWrapper(tag_fc_w, W)
            self.FeedBlobWrapper(tag_fc_b, b)
            # approach 2: caffe2 xavier
            # W = self.AddLayerWrapper(
            #     model.param_init_net.XavierFill,
            #     [],
            #     tag_fc_w,
            #     shape=[m, n]
            # )
            # b = self.AddLayerWrapper(
            #     model.param_init_net.ConstantFill,
            #     [],
            #     tag_fc_b,
            #     shape=[m]
            # )
            # initialize the MLP's momentum for the Adagrad optimizer
            # (odd index = weight momentum, even index = bias momentum)
            if self.emb_optimizer in ["adagrad", "rwsadagrad"]:
                # momentum of the weights
                self.FeedBlobWrapper(
                    "momentum_mlp_{}_{}".format(tag_layer, 2 * i - 1),
                    np.full((m, n), 0, dtype=np.float32),
                )
                # momentum of the biases
                self.FeedBlobWrapper(
                    "momentum_mlp_{}_{}".format(tag_layer, 2 * i),
                    np.full((m), 0, dtype=np.float32),
                )
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                self.onnx_tsd[tag_fc_w] = (onnx.TensorProto.FLOAT, W.shape)
                self.onnx_tsd[tag_fc_b] = (onnx.TensorProto.FLOAT, b.shape)
            # approach 1: construct fully connected operator using model.net
            fc = self.AddLayerWrapper(
                model.net.FC, [tag_in, tag_fc_w, tag_fc_b], tag_fc_y
            )
            # approach 2: construct fully connected operator using brew
            # https://github.com/caffe2/tutorials/blob/master/MNIST.ipynb
            # fc = brew.fc(model, layer, tag_fc_w, dim_in=m, dim_out=n)
            layers.append(fc)
            if i == sigmoid_layer:
                # approach 1: construct sigmoid operator using model.net
                layer = self.AddLayerWrapper(model.net.Sigmoid, tag_fc_y, tag_fc_z)
                # approach 2: using brew (which currently does not support sigmoid)
                # tag_sigm = tag_layer + ":::" + "sigmoid" + str(i)
                # layer = brew.sigmoid(model,fc,tag_sigmoid)
            else:
                # approach 1: construct relu operator using model.net
                layer = self.AddLayerWrapper(model.net.Relu, tag_fc_y, tag_fc_z)
                # approach 2: using brew
                # tag_relu = tag_layer + ":::" + "relu" + str(i)
                # layer = brew.relu(model,fc,tag_relu)
            tag_in = tag_fc_z
            layers.append(layer)
        # WARNING: the dependency between layers is implicit in the tags,
        # so only the last layer is added to the layers list. It will
        # later be used for interactions.
        return layers, weights
    def create_emb(self, m, ln, model, tag):
        """Build one embedding lookup per table in `ln` (table sizes), each of
        width `m`, pooled with SparseLengthsSum or — when weighted_pooling is
        set — SparseLengthsWeightedSum over per-sample weights.

        Tables are round-robined over devices in multi-device mode. Returns
        (emb_l, weights_l, vw_l): lookup ops, table-weight blob tags, and the
        per-sample-weight blob tags (only those that are learned).
        """
        (tag_layer, tag_in, tag_out) = tag
        emb_l = []
        weights_l = []
        vw_l = []
        for i in range(0, ln.size):
            n = ln[i]
            # select device (round-robin across tables)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_i"
            tbl_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_w"
            sum_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_z"
            weights_l.append(tbl_s)
            # initialize the weights
            # approach 1a: custom uniform fill in [-1/sqrt(n), 1/sqrt(n)]
            W = np.random.uniform(
                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
            ).astype(np.float32)
            # approach 1b: numpy rand
            # W = ra.rand(n, m).astype(np.float32)
            self.FeedBlobWrapper(tbl_s, W, False, device_id=d)
            # approach 2: caffe2 xavier
            # with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
            #     W = model.param_init_net.XavierFill([], tbl_s, shape=[n, m])
            # save the blob shapes for latter (only needed if onnx is requested)
            # initialize the embedding's momentum for the Adagrad optimizer
            # (rwsadagrad keeps one momentum value per row, hence shape (n,))
            if self.emb_optimizer == "adagrad":
                self.FeedBlobWrapper(
                    "momentum_emb_{}".format(i),
                    np.full((n, m), 0),
                    add_prefix=False,
                    device_id=d,
                )
            elif self.emb_optimizer == "rwsadagrad":
                self.FeedBlobWrapper(
                    "momentum_emb_{}".format(i),
                    np.full((n), 0),
                    add_prefix=False,
                    device_id=d,
                )
            if self.save_onnx:
                self.onnx_tsd[tbl_s] = (onnx.TensorProto.FLOAT, W.shape)
            # create operator
            if self.weighted_pooling is not None:
                vw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_v"
                psw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_s"
                VW = np.ones(n).astype(np.float32)
                self.FeedBlobWrapper(vw_s, VW, False, device_id=d)
                if self.weighted_pooling == "learned":
                    vw_l.append(vw_s)
                    grad_on_weights = True
                else:
                    grad_on_weights = False
                if self.save_onnx:
                    self.onnx_tsd[vw_s] = (onnx.TensorProto.FLOAT, VW.shape)
                if self.ndevices <= 1:
                    PSW = model.net.Gather([vw_s, ind_s], [psw_s])
                    EE = model.net.SparseLengthsWeightedSum(
                        [tbl_s, PSW, ind_s, len_s],
                        [sum_s],
                        grad_on_weights=grad_on_weights,
                    )
                else:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        PSW = model.net.Gather([vw_s, ind_s], [psw_s])
                        EE = model.net.SparseLengthsWeightedSum(
                            [tbl_s, PSW, ind_s, len_s],
                            [sum_s],
                            grad_on_weights=grad_on_weights,
                        )
            else:
                if self.ndevices <= 1:
                    EE = model.net.SparseLengthsSum([tbl_s, ind_s, len_s], [sum_s])
                else:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        EE = model.net.SparseLengthsSum([tbl_s, ind_s, len_s], [sum_s])
            emb_l.append(EE)
        return emb_l, weights_l, vw_l
    def create_interactions(self, x, ly, model, tag):
        """Combine dense features `x` with embedding outputs `ly`.

        "dot": stack all features, form pairwise dot products via BatchMatMul,
        keep the lower-triangular entries (via the precomputed
        "<tag>_tril_indices" blob), and concatenate them back with the dense
        features. "cat": simple row-wise concatenation. Returns the output op.
        """
        (tag_dense_in, tag_sparse_in, tag_int_out) = tag

        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            tag_int_out_info = tag_int_out + "_info"
            T, T_info = model.net.Concat(
                x + ly,
                [tag_int_out + "_cat_axis0", tag_int_out_info + "_cat_axis0"],
                axis=1,
                add_axis=1,
            )
            # perform a dot product
            Z = model.net.BatchMatMul([T, T], tag_int_out + "_matmul", trans_b=1)
            # append dense feature with the interactions (into a row vector)
            # approach 1: all
            # Zflat = model.net.Flatten(Z, tag_int_out + "_flatten", axis=1)
            # approach 2: unique (drop the symmetric upper triangle)
            Zflat_all = model.net.Flatten(Z, tag_int_out + "_flatten_all", axis=1)
            Zflat = model.net.BatchGather(
                [Zflat_all, tag_int_out + "_tril_indices"], tag_int_out + "_flatten"
            )
            R, R_info = model.net.Concat(
                x + [Zflat], [tag_int_out, tag_int_out_info], axis=1
            )
        elif self.arch_interaction_op == "cat":
            # concatenation features (into a row vector)
            tag_int_out_info = tag_int_out + "_info"
            R, R_info = model.net.Concat(
                x + ly, [tag_int_out, tag_int_out_info], axis=1
            )
        else:
            sys.exit(
                "ERROR: --arch-interaction-op="
                + self.arch_interaction_op
                + " is not supported"
            )
        return R
    def create_sequential_forward_ops(self):
        """Build the single-device forward graph:
        embeddings + bottom MLP -> feature interactions -> top MLP.
        Stores the per-stage ops/weights on self and sets self.last_output.
        """
        # embeddings
        tag = (self.temb, self.tsin, self.tsout)
        self.emb_l, self.emb_w, self.emb_vw = self.create_emb(
            self.m_spa, self.ln_emb, self.model, tag
        )
        # bottom mlp
        tag = (self.tbot, self.tdin, self.tdout)
        self.bot_l, self.bot_w = self.create_mlp(
            self.ln_bot, self.sigmoid_bot, self.model, tag
        )
        # interactions (between last bottom-MLP output and all embeddings)
        tag = (self.tdout, self.tsout, self.tint)
        Z = self.create_interactions([self.bot_l[-1]], self.emb_l, self.model, tag)
        # top mlp
        tag = (self.ttop, Z, self.tout)
        self.top_l, self.top_w = self.create_mlp(
            self.ln_top, self.sigmoid_top, self.model, tag
        )
        # debug prints
        # print(self.emb_l)
        # print(self.bot_l)
        # print(self.top_l)
        # setup the last output variable
        self.last_output = self.top_l[-1]
    def create_parallel_forward_ops(self):
        """Build the multi-device forward graph: embeddings are model-parallel
        (one table per device, round-robin), MLPs are data-parallel, and a
        butterfly shuffle (split + scatter copies) redistributes each table's
        output so every device holds its mini-batch shard of all tables.
        """
        # distribute embeddings (model parallelism)
        tag = (self.temb, self.tsin, self.tsout)
        self.emb_l, self.emb_w, self.emb_vw = self.create_emb(
            self.m_spa, self.ln_emb, self.model, tag
        )
        # replicate mlp (data parallelism)
        tag = (self.tbot, self.tdin, self.tdout)
        self.bot_l, self.bot_w = self.create_mlp(
            self.ln_bot, self.sigmoid_bot, self.model, tag
        )

        # add communication (butterfly shuffle)
        t_list = []
        for i, emb_output in enumerate(self.emb_l):
            # split input on the device that owns table i
            src_d = i % self.ndevices
            lo = [emb_output + "_split_" + str(d) for d in range(self.ndevices)]
            # approach 1: np and caffe2 operators assume the mini-batch size is
            # divisible exactly by the number of available devices
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)):
                self.model.net.Split(emb_output, lo, axis=0)
            """
            # approach 2: np and caffe2 operators do not assume exact divisibility
            ls = where_to_split(args.mini_batch_size, self.ndevices, _add_leftover=True)
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)):
                emb_output_split = self.model.net.Split(
                    emb_output, lo, split=lp, axis=0
                )
            """
            # scatter: copy each shard to its destination device (no-op when
            # source and destination coincide)
            y = []
            for dst_d in range(len(lo)):
                src_blob = lo[dst_d]
                dst_blob = str(src_blob).replace(
                    "gpu_" + str(src_d), "gpu_" + str(dst_d), 1
                )
                if src_blob != dst_blob:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, dst_d)
                    ):
                        blob = self.model.Copy(src_blob, dst_blob)
                else:
                    blob = dst_blob
                y.append(blob)
            t_list.append(y)
        # adjust lists to be ordered per device
        x = list(map(lambda x: list(x), zip(*self.bot_l)))
        ly = list(map(lambda y: list(y), zip(*t_list)))

        # interactions (one per device, on that device's shard)
        for d in range(self.ndevices):
            on_device = "gpu_" + str(d) + "/"
            tag = (
                on_device + self.tdout,
                on_device + self.tsout,
                on_device + self.tint,
            )
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                self.create_interactions([x[d][-1]], ly[d], self.model, tag)

        # replicate mlp (data parallelism)
        tag = (self.ttop, self.tint, self.tout)
        self.top_l, self.top_w = self.create_mlp(
            self.ln_top, self.sigmoid_top, self.model, tag
        )

        # debug prints
        # print(self.model.net.Proto(),end='\n')
        # sys.exit("ERROR: debugging")

        # setup the last output variable
        self.last_output = self.top_l[-1]
    def __init__(
        self,
        m_spa,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op,
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=-1,
        save_onnx=False,
        model=None,
        test_net=None,
        tag=None,
        ndevices=-1,
        forward_ops=True,
        enable_prof=False,
        weighted_pooling=None,
        emb_optimizer="sgd",
    ):
        """Configure the DLRM and (optionally) build its forward graph.

        m_spa: embedding width; ln_emb/ln_bot/ln_top: table sizes and MLP
        layer sizes; arch_interaction_op: "dot" or "cat". When `model` is
        None a fresh caffe2 workspace/ModelHelper is created; otherwise the
        caller supplies the model, test_net, and a 10-element `tag` tuple
        (workspace/tags are assumed to be initialized elsewhere).
        """
        super(DLRM_Net, self).__init__()

        # init model
        if model is None:
            global_init_opt = ["caffe2", "--caffe2_log_level=0"]
            if enable_prof:
                global_init_opt += [
                    "--logtostderr=0",
                    "--log_dir=$HOME",
                    "--caffe2_logging_print_net_summary=1",
                ]
            workspace.GlobalInit(global_init_opt)
            self.set_tags()
            self.model = model_helper.ModelHelper(name="DLRM", init_params=True)
            self.test_net = None
        else:
            # WARNING: assume that workspace and tags have been initialized elsewhere
            self.set_tags(
                tag[0],
                tag[1],
                tag[2],
                tag[3],
                tag[4],
                tag[5],
                tag[6],
                tag[7],
                tag[8],
                tag[9],
            )
            self.model = model
            self.test_net = test_net

        # save arguments
        self.m_spa = m_spa
        self.ln_emb = ln_emb
        self.ln_bot = ln_bot
        self.ln_top = ln_top
        self.arch_interaction_op = arch_interaction_op
        self.arch_interaction_itself = arch_interaction_itself
        self.sigmoid_bot = sigmoid_bot
        self.sigmoid_top = sigmoid_top
        self.save_onnx = save_onnx
        self.ndevices = ndevices
        self.emb_optimizer = emb_optimizer
        # any non-"fixed" weighted pooling request is treated as "learned"
        if weighted_pooling is not None and weighted_pooling != "fixed":
            self.weighted_pooling = "learned"
        else:
            self.weighted_pooling = weighted_pooling
        # onnx types and shapes dictionary
        if self.save_onnx:
            self.onnx_tsd = {}
        # create forward operators
        # NOTE: the create_*_forward_ops methods return None (they build the
        # graph in place), so returning their result from __init__ is legal.
        if forward_ops:
            if self.ndevices <= 1:
                return self.create_sequential_forward_ops()
            else:
                return self.create_parallel_forward_ops()
def set_tags(
self,
_tag_layer_top_mlp="top",
_tag_layer_bot_mlp="bot",
_tag_layer_embedding="emb",
_tag_feature_dense_in="dense_in",
_tag_feature_dense_out="dense_out",
_tag_feature_sparse_in="sparse_in",
_tag_feature_sparse_out="sparse_out",
_tag_interaction="interaction",
_tag_dense_output="prob_click",
_tag_dense_target="target",
):
# layer tags
self.ttop = _tag_layer_top_mlp
self.tbot = _tag_layer_bot_mlp
self.temb = _tag_layer_embedding
# dense feature tags
self.tdin = _tag_feature_dense_in
self.tdout = _tag_feature_dense_out
# sparse feature tags
self.tsin = _tag_feature_sparse_in
self.tsout = _tag_feature_sparse_out
# output and target tags
self.tint = _tag_interaction
self.ttar = _tag_dense_target
self.tout = _tag_dense_output
    def parameters(self):
        """Return the underlying ModelHelper (stand-in for parameters())."""
        return self.model
    def get_loss(self):
        """Fetch the scalar loss, summed across devices in multi-device mode."""
        return self.FetchBlobWrapper(self.loss, reduce_across="add")
    def get_output(self):
        """Fetch the model output, concatenating per-device shards row-wise."""
        return self.FetchBlobWrapper(self.last_output, reduce_across="concat")
    def create(self, X, S_lengths, S_indices, T):
        """Feed initial inputs and instantiate the nets in the workspace."""
        self.create_input(X, S_lengths, S_indices, T)
        self.create_model(X, S_lengths, S_indices, T)
    def create_input(self, X, S_lengths, S_indices, T):
        """Feed the initial input blobs: dense X, per-table sparse
        lengths/indices, and a zero placeholder for the targets. Also records
        blob shapes for onnx export when requested.
        """
        # feed input data to blobs
        self.FeedBlobWrapper(self.tdin, X, split=True)
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[self.tdin] = (onnx.TensorProto.FLOAT, X.shape)

        for i in range(len(self.emb_l)):
            # select device (same round-robin as create_emb)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i"
            self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d)
            self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d)
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                lshape = (len(S_lengths[i]),)  # =args.mini_batch_size
                ishape = (len(S_indices[i]),)
                self.onnx_tsd[len_s] = (onnx.TensorProto.INT32, lshape)
                self.onnx_tsd[ind_s] = (onnx.TensorProto.INT32, ishape)

        # feed target data to blobs
        if T is not None:
            # targets are fed as zeros here; the real labels are fed in run()
            zeros_fp32 = np.zeros(T.shape).astype(np.float32)
            self.FeedBlobWrapper(self.ttar, zeros_fp32, split=True)
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                self.onnx_tsd[self.ttar] = (onnx.TensorProto.FLOAT, T.shape)
    def create_model(self, X, S_lengths, S_indices, T):
        """Feed the lower-triangle index blob used by the "dot" interaction
        and instantiate the param-init, main, and (optional) test nets.
        """
        # setup tril indices for the interactions
        offset = 1 if self.arch_interaction_itself else 0
        num_fea = len(self.emb_l) + 1
        tril_indices = np.array(
            [j + i * num_fea for i in range(num_fea) for j in range(i + offset)]
        )
        self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices)

        # create compute graph
        if T is not None:
            # WARNING: RunNetOnce call is needed only if we use brew and ConstantFill.
            # We could use direct calls to self.model functions above to avoid it
            workspace.RunNetOnce(self.model.param_init_net)
            workspace.CreateNet(self.model.net)
            if self.test_net is not None:
                workspace.CreateNet(self.test_net)
    def run(self, X, S_lengths, S_indices, T, test_net=False, enable_prof=False):
        """Feed one mini-batch (dense X, sparse lengths/indices, targets T)
        and execute the test or training net; enable_prof runs the net under
        caffe2's benchmark instead.
        """
        # feed input data to blobs
        # dense features
        self.FeedBlobWrapper(self.tdin, X, split=True)
        # sparse features
        for i in range(len(self.emb_l)):
            # select device (same round-robin as create_emb)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i"
            self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d)
            self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d)

        # feed target data to blobs if needed
        if T is not None:
            self.FeedBlobWrapper(self.ttar, T, split=True)

        # execute compute graph
        if test_net:
            workspace.RunNet(self.test_net)
        else:
            if enable_prof:
                workspace.C.benchmark_net(self.model.net.Name(), 0, 1, True)
            else:
                workspace.RunNet(self.model.net)
        # debug prints
        # print("intermediate")
        # print(self.FetchBlobWrapper(self.bot_l[-1]))
        # for tag_emb in self.emb_l:
        #     print(self.FetchBlobWrapper(tag_emb))
        # print(self.FetchBlobWrapper(self.tint))
    def MSEloss(self, scale=1.0):
        """Add a mean-squared-error loss between output and target.

        NOTE(review): the 2.0 factor presumably compensates for the 1/2 in
        caffe2's SquaredL2Distance — confirm against the op's definition.
        """
        # add MSEloss to the model
        self.AddLayerWrapper(self.model.SquaredL2Distance, [self.tout, self.ttar], "sd")
        self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=2.0 * scale)
        # WARNING: "loss" is a special tag and should not be changed
        self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss")
    def BCEloss(self, scale=1.0, threshold=0.0):
        """Add a binary cross-entropy loss between output and target.

        A threshold in (0, 1) clips the predicted probability to
        [threshold, 1 - threshold] first, guarding the log against 0/1 inputs.
        """
        # add BCEloss to the mode
        if 0.0 < threshold and threshold < 1.0:
            self.AddLayerWrapper(
                self.model.Clip,
                self.tout,
                "tout_c",
                min=threshold,
                max=(1.0 - threshold),
            )
            self.AddLayerWrapper(self.model.MakeTwoClass, "tout_c", "tout_2c")
        else:
            self.AddLayerWrapper(self.model.MakeTwoClass, self.tout, "tout_2c")
        self.AddLayerWrapper(self.model.LabelCrossEntropy, ["tout_2c", self.ttar], "sd")
        # WARNING: "loss" is a special tag and should not be changed
        if scale == 1.0:
            self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd", "loss")
        else:
            self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=scale)
            self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss")
    def sgd_optimizer(
        self, learning_rate, T=None, _gradientMap=None, sync_dense_params=True
    ):
        """Add plain-SGD update operators for all weights.

        Dense MLP weights use WeightedSum (optionally NCCL-allreduced across
        devices first); embedding tables and learned per-sample weights use
        sparse ScatterWeightedSum updates on their owning device. `T` may
        supply existing (one, iter, lr) tags; `_gradientMap` may supply a
        precomputed gradient map.
        """
        # create one, it and lr tags (or use them if already present)
        if T is not None:
            (tag_one, tag_it, tag_lr) = T
        else:
            (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")
        # approach 1: feed values directly
        # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
        # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
        # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
        # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
        #     base_lr=-1 * learning_rate, policy="fixed")
        # approach 2: use brew
        self.AddLayerWrapper(
            self.model.param_init_net.ConstantFill,
            [],
            tag_one,
            shape=[1],
            value=1.0,
        )
        self.AddLayerWrapper(brew.iter, self.model, tag_it)
        # negative base_lr because WeightedSum adds lr * grad to the weights
        self.AddLayerWrapper(
            self.model.LearningRate,
            tag_it,
            tag_lr,
            base_lr=-1 * learning_rate,
            policy="fixed",
        )
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
            self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))
        # create gradient maps (or use them if already present)
        if _gradientMap is not None:
            self.gradientMap = _gradientMap
        else:
            if self.loss.__class__ == list:
                self.gradientMap = self.model.AddGradientOperators(self.loss)
            else:
                self.gradientMap = self.model.AddGradientOperators([self.loss])
        # update weights
        # approach 1: builtin function
        # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
        # approach 2: custom code
        # top MLP weight and bias
        for w in self.top_w:
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.AddLayerWrapper(
                self.model.WeightedSum, [w, tag_one, "", tag_lr], w, reset_grad=True
            )
        # bottom MLP weight and bias
        for w in self.bot_w:
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.AddLayerWrapper(
                self.model.WeightedSum, [w, tag_one, "", tag_lr], w, reset_grad=True
            )
        # update embeddings
        for i, w in enumerate(self.emb_w):
            # select device (d is only read below when ndevices > 1)
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pickup gradient
            w_grad = self.gradientMap[w]
            # update weights (sparse: only the gathered rows are touched)
            if self.ndevices > 1:
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
            else:
                self.model.ScatterWeightedSum(
                    [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                )
        # update per sample weights
        if self.weighted_pooling == "learned":
            for i, w in enumerate(self.emb_vw):
                # select device
                if self.ndevices > 1:
                    d = i % self.ndevices
                # create tags
                on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
                _tag_one = on_device + tag_one
                _tag_lr = on_device + tag_lr
                # pickup gradient
                w_grad = self.gradientMap[w]
                # update weights
                if self.ndevices > 1:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        self.model.ScatterWeightedSum(
                            [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                        )
                else:
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
    def adagrad_optimizer(
        self,
        learning_rate,
        T=None,
        _gradientMap=None,
        sync_dense_params=True,
        epsilon=1e-10,
        decay_=0.0,
        weight_decay_=0.0,
    ):
        """Add Adagrad update operators for all weights.

        Dense MLP weights use the Adagrad op with the momenta created in
        create_mlp; embedding tables deduplicate their sparse gradients
        (Unique + UnsortedSegmentSum) and then apply SparseAdagrad or
        RowWiseSparseAdagrad per self.emb_optimizer. Learned per-sample
        weights still use plain sparse SGD (ScatterWeightedSum).
        """
        # create one, it and lr tags (or use them if already present)
        if T is not None:
            (tag_one, tag_it, tag_lr) = T
        else:
            (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")
        # approach 1: feed values directly
        # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
        # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
        # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
        # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
        #     base_lr=-1 * learning_rate, policy="fixed")
        # approach 2: use brew
        self.AddLayerWrapper(
            self.model.param_init_net.ConstantFill,
            [],
            tag_one,
            shape=[1],
            value=1.0,
        )
        self.AddLayerWrapper(brew.iter, self.model, tag_it)
        self.AddLayerWrapper(
            self.model.LearningRate,
            tag_it,
            tag_lr,
            base_lr=-1 * learning_rate,
            policy="fixed",
        )
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
            self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))
        # create gradient maps (or use them if already present)
        if _gradientMap is not None:
            self.gradientMap = _gradientMap
        else:
            if self.loss.__class__ == list:
                self.gradientMap = self.model.AddGradientOperators(self.loss)
            else:
                self.gradientMap = self.model.AddGradientOperators([self.loss])
        # update weights
        # approach 1: builtin function
        # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
        # approach 2: custom code
        # top MLP weight and bias (momentum blobs are 1-indexed, see create_mlp)
        for i, w in enumerate(self.top_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.model.Adagrad(
                [w, "momentum_mlp_top_{}".format(i + 1), self.gradientMap[w], tag_lr],
                [w, "momentum_mlp_top_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_,
            )
        # bottom MLP weight and bias
        for i, w in enumerate(self.bot_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.model.Adagrad(
                [w, "momentum_mlp_bot_{}".format(i + 1), self.gradientMap[w], tag_lr],
                [w, "momentum_mlp_bot_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_,
            )
        # update embeddings
        for i, w in enumerate(self.emb_w):
            # select device (d is only read below when ndevices > 1)
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pickup gradient
            w_grad = self.gradientMap[w]

            # update weights; the closure captures the current i/_tag_lr/w_grad
            # and is invoked immediately below (inside or outside a DeviceScope),
            # so the usual late-binding pitfall does not apply here
            def add_optimizer():
                # deduplicate indices and sum the gradient rows per unique index
                self.model.Unique(
                    w_grad.indices,
                    ["unique_w_grad_indices", "remapping_w_grad_indices"],
                )
                self.model.UnsortedSegmentSum(
                    [w_grad.values, "remapping_w_grad_indices"], "unique_w_grad_values"
                )
                if self.emb_optimizer == "adagrad":
                    self.model.SparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr,
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_,
                    )
                elif self.emb_optimizer == "rwsadagrad":
                    self.model.RowWiseSparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr,
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_,
                    )

            if self.ndevices > 1:
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    add_optimizer()
            else:
                add_optimizer()
        # update per sample weights
        if self.weighted_pooling == "learned":
            for i, w in enumerate(self.emb_vw):
                # select device
                if self.ndevices > 1:
                    d = i % self.ndevices
                # create tags
                on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
                _tag_one = on_device + tag_one
                _tag_lr = on_device + tag_lr
                # pickup gradient
                w_grad = self.gradientMap[w]
                # update weights
                if self.ndevices > 1:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        self.model.ScatterWeightedSum(
                            [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                        )
                else:
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
    def print_all(self):
        """Debug helper: print every blob name and value in the workspace."""
        # approach 1: all
        print(workspace.Blobs(), end="\n")
        for _, l in enumerate(workspace.Blobs()):
            print(l)
            print(self.FetchBlobWrapper(l))
        # approach 2: only summary
        # for param in self.model.params:
        #     self.model.Summarize(param, [], to_file=1)
        #     self.model.Summarize(self.model.param_to_grad[param], [], to_file=1)
    def print_weights(self):
        """Debug helper: print embedding tables, per-sample weights, and
        MLP weights (device 0 only in multi-device mode).
        """
        for _, l in enumerate(self.emb_w):
            # print(l)
            print(self.FetchBlobWrapper(l, False))
        if self.weighted_pooling == "learned":
            for _, l in enumerate(self.emb_vw):
                # print(l)
                print(self.FetchBlobWrapper(l, False))
        for _, l in enumerate(self.bot_w):
            # print(l)
            if self.ndevices > 1:
                print(self.FetchBlobWrapper(l, False, device_id=0))
            else:
                print(self.FetchBlobWrapper(l))
        for _, l in enumerate(self.top_w):
            # print(l)
            if self.ndevices > 1:
                print(self.FetchBlobWrapper(l, False, device_id=0))
            else:
                print(self.FetchBlobWrapper(l))
    def print_activations(self):
        """Debug helper: print embedding, bottom-MLP, interaction, and
        top-MLP activation blobs.
        """
        for _, l in enumerate(self.emb_l):
            print(l)
            print(self.FetchBlobWrapper(l, False))
        for _, l in enumerate(self.bot_l):
            print(l)
            print(self.FetchBlobWrapper(l))
        print(self.tint)
        print(self.FetchBlobWrapper(self.tint))
        for _, l in enumerate(self.top_l):
            print(l)
            print(self.FetchBlobWrapper(l))
def define_metrics():
    """Return a dict of metric name -> callable(y_true, y_score).

    Thresholded metrics (recall/precision/f1/accuracy) binarize the scores
    with np.round; "ap" and "roc_auc" consume the raw scores.
    """
    metrics = {
        "loss": lambda y_true, y_score: sklearn.metrics.log_loss(
            y_true=y_true, y_pred=y_score, labels=[0, 1]
        ),
        "recall": lambda y_true, y_score: sklearn.metrics.recall_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "precision": lambda y_true, y_score: sklearn.metrics.precision_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "f1": lambda y_true, y_score: sklearn.metrics.f1_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "ap": sklearn.metrics.average_precision_score,
        "roc_auc": sklearn.metrics.roc_auc_score,
        "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        # 'pre_curve' : sklearn.metrics.precision_recall_curve,
        # 'roc_curve' : sklearn.metrics.roc_curve,
    }
    return metrics
def calculate_metrics(targets, scores):
    """Concatenate per-batch targets/scores and evaluate every metric from
    define_metrics() on them.

    A metric that raises is reported as -1 (best-effort) and the error is
    printed, so one failing metric never aborts validation.
    """
    scores = np.concatenate(scores, axis=0)
    targets = np.concatenate(targets, axis=0)

    validation_results = {}
    for metric_name, metric_function in define_metrics().items():
        try:
            validation_results[metric_name] = metric_function(targets, scores)
        except Exception as error:
            validation_results[metric_name] = -1
            print("{} in calculating {}".format(error, metric_name))
    return validation_results
if __name__ == "__main__":
    import argparse
    ### import packages ###
    import sys
    ### parse arguments ###
    # Command-line interface for the Caffe2 DLRM trainer.
    parser = argparse.ArgumentParser(
        description="Train Deep Learning Recommendation Model (DLRM)"
    )
    # model related parameters
    parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
    parser.add_argument("--arch-embedding-size", type=str, default="4-3-2")
    parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2")
    parser.add_argument("--arch-mlp-top", type=str, default="4-2-1")
    parser.add_argument("--arch-interaction-op", type=str, default="dot")
    parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
    # activations and loss
    parser.add_argument("--activation-function", type=str, default="relu")
    parser.add_argument("--loss-function", type=str, default="mse") # or bce
    parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7
    # NOTE(review): argparse type=bool treats any non-empty string as True
    # (e.g. "--round-targets False" yields True); the other type=bool flags
    # below share this pitfall.
    parser.add_argument("--round-targets", type=bool, default=False)
    parser.add_argument("--weighted-pooling", type=str, default=None)
    # data
    parser.add_argument("--data-size", type=int, default=1)
    parser.add_argument("--num-batches", type=int, default=0)
    parser.add_argument(
        "--data-generation", type=str, default="random"
    ) # or synthetic or dataset
    parser.add_argument(
        "--rand-data-dist", type=str, default="uniform"
    ) # uniform or gaussian
    parser.add_argument("--rand-data-min", type=float, default=0)
    parser.add_argument("--rand-data-max", type=float, default=1)
    parser.add_argument("--rand-data-mu", type=float, default=-1)
    parser.add_argument("--rand-data-sigma", type=float, default=1)
    parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
    parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
    parser.add_argument("--raw-data-file", type=str, default="")
    parser.add_argument("--processed-data-file", type=str, default="")
    parser.add_argument("--data-randomize", type=str, default="total") # or day or none
    parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
    parser.add_argument("--max-ind-range", type=int, default=-1)
    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
    parser.add_argument("--num-indices-per-lookup", type=int, default=10)
    parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
    parser.add_argument("--num-workers", type=int, default=0)
    parser.add_argument("--memory-map", action="store_true", default=False)
    # training
    parser.add_argument("--mini-batch-size", type=int, default=1)
    parser.add_argument("--nepochs", type=int, default=1)
    parser.add_argument("--learning-rate", type=float, default=0.01)
    parser.add_argument("--print-precision", type=int, default=5)
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--sync-dense-params", type=bool, default=True)
    parser.add_argument("--caffe2-net-type", type=str, default="")
    parser.add_argument(
        "--optimizer",
        type=str,
        default="sgd",
        help="""This is the optimizer for embedding tables.""",
    )
    parser.add_argument(
        "--dataset-multiprocessing",
        action="store_true",
        default=False,
        help="The Kaggle dataset can be multiprocessed in an environment \
            with more than 7 CPU cores and more than 20 GB of memory. \n \
            The Terabyte dataset can be multiprocessed in an environment \
            with more than 24 CPU cores and at least 1 TB of memory.",
    )
    # inference
    parser.add_argument("--inference-only", action="store_true", default=False)
    # onnx (or protobuf with shapes)
    parser.add_argument("--save-onnx", action="store_true", default=False)
    parser.add_argument("--save-proto-types-shapes", action="store_true", default=False)
    # gpu
    parser.add_argument("--use-gpu", action="store_true", default=False)
    # debugging and profiling
    parser.add_argument("--print-freq", type=int, default=1)
    parser.add_argument("--test-freq", type=int, default=-1)
    parser.add_argument("--test-mini-batch-size", type=int, default=-1)
    parser.add_argument("--test-num-workers", type=int, default=-1)
    parser.add_argument("--print-time", action="store_true", default=False)
    parser.add_argument("--debug-mode", action="store_true", default=False)
    parser.add_argument("--enable-profiling", action="store_true", default=False)
    parser.add_argument("--plot-compute-graph", action="store_true", default=False)
    # mlperf logging (disables other output and stops early)
    parser.add_argument("--mlperf-logging", action="store_true", default=False)
    # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
    parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
    # stop at target AUC Terabyte (no subsampling) 0.8025
    parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
    args = parser.parse_args()
    # Multiprocessed dataset preprocessing is known to break on <= 3.7.
    if args.dataset_multiprocessing:
        assert sys.version_info[0] >= 3 and sys.version_info[1] > 7, (
            "The dataset_multiprocessing "
            + "flag is susceptible to a bug in Python 3.7 and under. "
            + "https://github.com/facebookresearch/dlrm/issues/172"
        )
    ### some basic setup ###
    # WARNING: to obtain exactly the same initialization for
    # the weights we need to start from the same random seed.
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    if args.test_mini_batch_size < 0:
        # if the parameter is not set, use the training batch size
        args.test_mini_batch_size = args.mini_batch_size
    if args.test_num_workers < 0:
        # if the parameter is not set, use the same parameter for training
        args.test_num_workers = args.num_workers
    # Pick the Caffe2 device; device_opt scopes all subsequent net building.
    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
        ngpus = workspace.NumGpuDevices() # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")
    ### prepare training data ###
    # NOTE(review): np.fromstring is deprecated for text parsing in modern
    # numpy; np.array([int(s) for s in x.split("-")]) is the replacement.
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    if args.data_generation == "dataset":
        # Criteo dataset path: sizes come from the data itself.
        if args.num_workers > 0 or args.test_num_workers > 0:
            print(
                "WARNING: non default --num-workers or --test-num-workers options"
                + " are not supported and will be ignored"
            )
        if args.mini_batch_size != args.test_mini_batch_size:
            print(
                "WARNING: non default ----test-mini-batch-size option"
                + " is not supported and will be ignored"
            )
        # input and target from dataset
        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(
            args,
            offset_to_length_converter=True,
        )
        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
        nbatches_test = len(test_ld)
        ln_emb = train_data.counts
        m_den = train_data.m_den
        # enforce maximum limit on number of vectors per embedding
        if args.max_ind_range > 0:
            ln_emb = np.array(
                list(
                    map(
                        lambda x: x if x < args.max_ind_range else args.max_ind_range,
                        ln_emb,
                    )
                )
            )
        # bottom mlp input width must equal the dense feature count
        ln_bot[0] = m_den
    else:
        # Synthetic/random data path: sizes come from the CLI arguments.
        if args.num_workers > 0 or args.test_num_workers > 0:
            print(
                "WARNING: non default --num-workers or --test-num-workers options"
                + " are not supported and will be ignored"
            )
        if args.mini_batch_size != args.test_mini_batch_size:
            print(
                "WARNING: non default ----test-mini-batch-size option"
                + " is not supported and will be ignored"
            )
        # input and target at random
        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
        m_den = ln_bot[0]
        train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(
            args,
            ln_emb,
            m_den,
            offset_to_length_converter=True,
        )
        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
        nbatches_test = len(test_ld)
    # table_feature_map = {idx : idx for idx in range(len(ln_emb))}
    ### parse command line arguments ###
    m_spa = args.arch_sparse_feature_size
    ln_emb = np.asarray(ln_emb)
    num_fea = ln_emb.size + 1 # num sparse + num dense features
    m_den_out = ln_bot[ln_bot.size - 1]
    # Derive the top-mlp input width from the chosen interaction op.
    if args.arch_interaction_op == "dot":
        # approach 1: all
        # num_int = num_fea * num_fea + m_den_out
        # approach 2: unique
        # lower triangle of the num_fea x num_fea dot-product matrix,
        # with or without the diagonal (self-interaction)
        if args.arch_interaction_itself:
            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
        else:
            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
    elif args.arch_interaction_op == "cat":
        num_int = num_fea * m_den_out
    else:
        sys.exit(
            "ERROR: --arch-interaction-op="
            + args.arch_interaction_op
            + " is not supported"
        )
    # prepend the interaction width to the user-specified top-mlp sizes
    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
    # sanity check: feature sizes and mlp dimensions must match
    if m_den != ln_bot[0]:
        sys.exit(
            "ERROR: arch-dense-feature-size "
            + str(m_den)
            + " does not match first dim of bottom mlp "
            + str(ln_bot[0])
        )
    if m_spa != m_den_out:
        sys.exit(
            "ERROR: arch-sparse-feature-size "
            + str(m_spa)
            + " does not match last dim of bottom mlp "
            + str(m_den_out)
        )
    if num_int != ln_top[0]:
        sys.exit(
            "ERROR: # of feature interactions "
            + str(num_int)
            + " does not match first dim of top mlp "
            + str(ln_top[0])
        )
    # test prints (model arch)
    if args.debug_mode:
        print("model arch:")
        print(
            "mlp top arch "
            + str(ln_top.size - 1)
            + " layers, with input to output dimensions:"
        )
        print(ln_top)
        print("# of interactions")
        print(num_int)
        print(
            "mlp bot arch "
            + str(ln_bot.size - 1)
            + " layers, with input to output dimensions:"
        )
        print(ln_bot)
        print("# of features (sparse and dense)")
        print(num_fea)
        print("dense feature size")
        print(m_den)
        print("sparse feature size")
        print(m_spa)
        print(
            "# of embeddings (= # of sparse features) "
            + str(ln_emb.size)
            + ", with dimensions "
            + str(m_spa)
            + "x:"
        )
        print(ln_emb)
        print("data (inputs and targets):")
        for j, inputBatch in enumerate(train_ld):
            lX_j, lS_l_j, lS_i_j, lT_j = inputBatch
            print("mini-batch: %d" % j)
            print(lX_j)
            print(lS_l_j)
            print(lS_i_j)
            print(lT_j)
    ### construct the neural network specified above ###
    # WARNING: to obtain exactly the same initialization for
    # the weights we need to start from the same random seed.
    # np.random.seed(args.numpy_rand_seed)
    # ndevices > 1 triggers the model-parallel path inside DLRM_Net
    ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
    flag_types_shapes = args.save_onnx or args.save_proto_types_shapes
    flag_forward_ops = not (use_gpu and ndevices > 1)
    with core.DeviceScope(device_opt):
        dlrm = DLRM_Net(
            m_spa,
            ln_emb,
            ln_bot,
            ln_top,
            args.arch_interaction_op,
            arch_interaction_itself=args.arch_interaction_itself,
            sigmoid_bot=-1,
            sigmoid_top=ln_top.size - 1,
            save_onnx=flag_types_shapes,
            ndevices=ndevices,
            # forward_ops = flag_forward_ops
            enable_prof=args.enable_profiling,
            weighted_pooling=args.weighted_pooling,
            emb_optimizer=args.optimizer,
        )
    # load nccl if using multiple devices
    if args.sync_dense_params and ndevices > 1:
        dyndep.InitOpsLibrary("//caffe2/caffe2/contrib/nccl:nccl_ops")
    # set the net type for better performance (dag, async_scheduling, etc)
    if args.caffe2_net_type:
        dlrm.parameters().net.Proto().type = args.caffe2_net_type
    # plot compute graph
    if args.plot_compute_graph:
        graph = net_drawer.GetPydotGraph(
            dlrm.parameters().net, "dlrm_s_caffe2_graph", "BT"
        )
        graph.write_pdf(graph.get_name() + ".pdf")
    # test prints
    if args.debug_mode:
        print("initial parameters (weights and bias):")
        dlrm.print_weights()
    # add training loss if needed
    if not args.inference_only:
        with core.DeviceScope(device_opt):
            # specify the loss function
            # per-device scaling so losses sum to the single-device value
            nd = 1.0 if dlrm.ndevices <= 1 else 1.0 / dlrm.ndevices # 1
            if args.loss_function == "mse":
                dlrm.MSEloss(scale=nd)
            elif args.loss_function == "bce":
                dlrm.BCEloss(scale=nd, threshold=args.loss_threshold)
            else:
                sys.exit(
                    "ERROR: --loss-function=" + args.loss_function + " is not supported"
                )
        # define test net (as train net without gradients)
        dlrm.test_net = core.Net(copy.deepcopy(dlrm.model.net.Proto()))
        # specify the optimizer algorithm
        if args.optimizer == "sgd":
            dlrm.sgd_optimizer(
                args.learning_rate, sync_dense_params=args.sync_dense_params
            )
        elif args.optimizer in ["adagrad", "rwsadagrad"]:
            dlrm.adagrad_optimizer(
                args.learning_rate, sync_dense_params=args.sync_dense_params
            )
        else:
            sys.exit(
                """ERROR: Select an optimizer for
                embedding tables : 'sgd', 'adagrad',
                or 'rwsadagrad' """
            )
    # init/create
    # peek one batch to materialize blob shapes for net creation
    X, lS_l, lS_i, T = next(
        iter(train_ld)
    ) # does not affect the enumerate(train_ld) in the main loop
    dlrm.create(X, lS_l, lS_i, T.int())
### main loop ###
best_gA_test = 0
best_auc_test = 0
total_time = 0
total_loss = 0
total_accu = 0
total_iter = 0
total_samp = 0
k = 0
print("time/loss/accuracy (if enabled):")
while k < args.nepochs:
j = 0
for j, inputBatch in enumerate(train_ld):
# forward and backward pass, where the latter runs only
# when gradients and loss have been added to the net
time1 = time.time()
lX_j, lS_l_j, lS_i_j, lT_j = inputBatch
lT_j = lT_j.int() if args.loss_function == "bce" else lT_j
dlrm.run(lX_j, lS_l_j, lS_i_j, lT_j)
time2 = time.time()
total_time += time2 - time1
# compte loss and accuracy
Z = dlrm.get_output() # numpy array
T = lT_j.numpy()
"""
# debug prints
print("output and loss")
print(Z)
print(dlrm.get_loss())
"""
mbs = T.shape[0] # = args.mini_batch_size except maybe for last
A = np.sum((np.round(Z, 0) == T).astype(np.uint8))
total_accu += 0 if args.inference_only else A
total_loss += 0 if args.inference_only else dlrm.get_loss() * mbs
total_iter += 1
total_samp += mbs
# print time, loss and accuracy
should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches)
should_test = (
(args.test_freq > 0)
and (args.data_generation in ["dataset", "random"])
and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
)
if should_print or should_test:
gT = 1000.0 * total_time / total_iter if args.print_time else -1
total_time = 0
gA = total_accu / total_samp
total_accu = 0
gL = total_loss / total_samp
total_loss = 0
str_run_type = "inference" if args.inference_only else "training"
print(
"Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format(
str_run_type, j + 1, nbatches, k, gT
)
+ " loss {:.6f}".format(gL)
)
total_iter = 0
total_samp = 0
# debug prints
# print(Z)
# print(T)
# testing
if should_test and not args.inference_only:
# don't measure training iter time in a test iteration
if args.mlperf_logging:
previous_iteration_time = None
test_accu = 0
test_loss = 0
test_samp = 0
if args.mlperf_logging:
scores = []
targets = []
for i, testBatch in enumerate(test_ld):
# early exit if nbatches was set by the user and was exceeded
if nbatches > 0 and i >= nbatches:
break
# forward pass
lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i = testBatch
lT_test_i = (
lT_test_i.int()
if args.loss_function == "bce"
else lT_test_i
)
dlrm.run(
lX_test_i,
lS_l_test_i,
lS_i_test_i,
lT_test_i,
test_net=True,
)
Z_test = dlrm.get_output()
T_test = lT_test_i.numpy()
if args.mlperf_logging:
scores.append(Z_test)
targets.append(T_test)
else:
# compte loss and accuracy
L_test = dlrm.get_loss()
mbs_test = T_test.shape[0] # = mini_batch_size except last
A_test = np.sum(
(np.round(Z_test, 0) == T_test).astype(np.uint8)
)
test_accu += A_test
test_loss += L_test * mbs_test
test_samp += mbs_test
# compute metrics (after test loop has finished)
if args.mlperf_logging:
validation_results = calculate_metrics(targets, scores)
gA_test = validation_results["accuracy"]
gL_test = validation_results["loss"]
else:
gA_test = test_accu / test_samp
gL_test = test_loss / test_samp
# print metrics
is_best = gA_test > best_gA_test
if is_best:
best_gA_test = gA_test
if args.mlperf_logging:
is_best = validation_results["roc_auc"] > best_auc_test
if is_best:
best_auc_test = validation_results["roc_auc"]
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
+ " loss {:.6f}, recall {:.4f}, precision {:.4f},".format(
validation_results["loss"],
validation_results["recall"],
validation_results["precision"],
)
+ " f1 {:.4f}, ap {:.4f},".format(
validation_results["f1"],
validation_results["ap"],
)
+ " auc {:.4f}, best auc {:.4f},".format(
validation_results["roc_auc"], best_auc_test
)
+ " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
validation_results["accuracy"] * 100, best_gA_test * 100
)
)
else:
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0)
+ " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format(
gL_test, gA_test * 100, best_gA_test * 100
)
)
# check thresholds
if (
args.mlperf_logging
and (args.mlperf_acc_threshold > 0)
and (best_gA_test > args.mlperf_acc_threshold)
):
print(
"MLPerf testing accuracy threshold "
+ str(args.mlperf_acc_threshold)
+ " reached, stop training"
)
break
if (
args.mlperf_logging
and (args.mlperf_auc_threshold > 0)
and (best_auc_test > args.mlperf_auc_threshold)
):
print(
"MLPerf testing auc threshold "
+ str(args.mlperf_auc_threshold)
+ " reached, stop training"
)
break
j += 1 # nbatches
k += 1 # nepochs
    # test prints
    if not args.inference_only and args.debug_mode:
        print("updated parameters (weights and bias):")
        dlrm.print_weights()
    # build onnx model from caffe2
    if args.save_onnx:
        pnet = dlrm.parameters().net.Proto()
        inet = dlrm.parameters().param_init_net.Proto()
        value_info = dlrm.onnx_tsd # None
        # debug prints
        # print(value_info)
        # WARNING: Why Caffe2 to ONNX net transformation currently does not work?
        # 1. ONNX does not support SparseLengthsSum operator directly. A workaround
        # could be for the Caffe2 ONNX frontend to indirectly map this operator to
        # Gather and ReducedSum ONNX operators, following the PyTorch approach.
        c2f = caffe2.python.onnx.frontend.Caffe2Frontend()
        dlrm_caffe2_onnx = c2f.caffe2_net_to_onnx_model(pnet, inet, value_info)
        # check the onnx model
        onnx.checker.check_model(dlrm_caffe2_onnx)
        # save model to a file
        with open("dlrm_s_caffe2.onnx", "w+") as dlrm_caffe2_onnx_file:
            dlrm_caffe2_onnx_file.write(str(dlrm_caffe2_onnx))
    # build protobuf with types and shapes
    if args.save_proto_types_shapes:
        # add types and shapes to protobuf
        # maps onnx tensor dtypes to the equivalent caffe2 dtypes
        __TYPE_MAPPING = {
            onnx.TensorProto.FLOAT: caffe2_pb2.TensorProto.FLOAT,
            onnx.TensorProto.UINT8: caffe2_pb2.TensorProto.UINT8,
            onnx.TensorProto.INT8: caffe2_pb2.TensorProto.INT8,
            onnx.TensorProto.UINT16: caffe2_pb2.TensorProto.UINT16,
            onnx.TensorProto.INT16: caffe2_pb2.TensorProto.INT16,
            onnx.TensorProto.INT32: caffe2_pb2.TensorProto.INT32,
            onnx.TensorProto.INT64: caffe2_pb2.TensorProto.INT64,
            onnx.TensorProto.STRING: caffe2_pb2.TensorProto.STRING,
            onnx.TensorProto.BOOL: caffe2_pb2.TensorProto.BOOL,
            onnx.TensorProto.FLOAT16: caffe2_pb2.TensorProto.FLOAT16,
            onnx.TensorProto.DOUBLE: caffe2_pb2.TensorProto.DOUBLE,
        }
        pnet = dlrm.parameters().net.Proto()
        arg = pnet.arg.add()
        arg.name = "input_shape_info"
        # attach type/shape info for every external input we know about
        for i in pnet.external_input:
            if i in dlrm.onnx_tsd:
                onnx_dtype, shape = dlrm.onnx_tsd[i]
                t = arg.tensors.add()
                t.name = i
                t.data_type = __TYPE_MAPPING[onnx_dtype]
                t.dims.extend(shape)
            else:
                print("Warning: we don't have shape/type info for input: {}".format(i))
        # debug print
        # print(pnet)
        # export the protobuf with types and shapes
        with open("dlrm_s_caffe2.proto", "w+") as dlrm_s_proto_file:
            dlrm_s_proto_file.write(str(pnet))
    """
    # export the protobuf with types and shapes as well as weights
    # see https://github.com/pytorch/pytorch/issues/9533
    #save
    net = dlrm.parameters().net
    params = dlrm.parameters().params
    init_net, predict_net = mobile_exporter.Export(workspace, net, params)
    with open("dlrm_s_caffe2.predict", "wb") as dlrm_s_predict_file:
        dlrm_s_predict_file.write(predict_net.SerializeToString())
    with open("dlrm_s_caffe2.init", "wb") as dlrm_s_init_file:
        dlrm_s_init_file.write(init_net.SerializeToString())
    #load
    net_def = caffe2_pb2.NetDef()
    init_def= caffe2_pb2.NetDef()
    with open("dlrm_s_caffe2.predict", "rb") as dlrm_s_predict_file:
        net_def.ParseFromString(dlrm_s_predict_file.read())
        print(net_def)
    with open("dlrm_s_caffe2.init", "rb") as dlrm_s_init_file:
        init_def.ParseFromString(dlrm_s_init_file.read())
        print(init_def)
    """
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
# miscellaneous
import builtins
import datetime
import json
import sys
import time
# onnx
# The onnx import causes deprecation warnings every time workers
# are spawned during testing. So, we filter out those warnings.
import warnings
# data generation
import dlrm_data_pytorch as dp
# For distributed run
import extend_distributed as ext_dist
import mlperf_logger
# numpy
import numpy as np
import optim.rwsadagrad as RowWiseSparseAdagrad
import sklearn.metrics
# pytorch
import torch
import torch.nn as nn
# dataloader
try:
from internals import fbDataLoader, fbInputBatchFormatter
has_internal_libs = True
except ImportError:
has_internal_libs = False
from torch._ops import ops
from torch.autograd.profiler import record_function
from torch.nn.parallel.parallel_apply import parallel_apply
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.scatter_gather import gather, scatter
from torch.nn.parameter import Parameter
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.tensorboard import SummaryWriter
# mixed-dimension trick
from tricks.md_embedding_bag import md_solver, PrEmbeddingBag
# quotient-remainder trick
from tricks.qr_embedding_bag import QREmbeddingBag
# onnx is optional: import it while silencing its DeprecationWarnings
# (they would otherwise fire every time a dataloader worker spawns).
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        import onnx
    except ImportError as error:
        print("Unable to import onnx. ", error)
# from torchviz import make_dot
# import torch.nn.functional as Functional
# from torch.nn.parameter import Parameter
# NOTE(review): if builtins has no IOError this falls back to the STRING
# "FileNotFoundError", not the exception class — looks unintended; verify
# how `exc` is used before relying on it in an except clause.
exc = getattr(builtins, "IOError", "FileNotFoundError")
def time_wrap(use_gpu):
    """Wall-clock timestamp; on GPU, wait for pending CUDA work first."""
    if use_gpu:
        # flush queued kernels so the timestamp reflects finished work
        torch.cuda.synchronize()
    return time.time()
def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1):
    """Run one DLRM forward pass, moving inputs to `device` when on GPU.

    lS_i / lS_o may each be a list of per-table tensors or a single
    stacked tensor; both forms are transferred as needed.
    """

    def _to_device(t):
        # handle list-of-tensors and stacked-tensor forms uniformly
        if isinstance(t, list):
            return [part.to(device) for part in t]
        return t.to(device)

    with record_function("DLRM forward"):
        # only the single-device GPU path transfers sparse inputs here;
        # the multi-device path scatters them inside the model
        if use_gpu and ndevices == 1:
            lS_i = _to_device(lS_i)
            lS_o = _to_device(lS_o)
        return dlrm(X.to(device), lS_o, lS_i)
def loss_fn_wrap(Z, T, use_gpu, device):
    """Compute the configured training loss for predictions Z vs targets T."""
    with record_function("DLRM loss compute"):
        if args.loss_function in ("mse", "bce"):
            return dlrm.loss_fn(Z, T.to(device))
        if args.loss_function == "wbce":
            # weighted BCE: per-sample weight chosen by the 0/1 target class
            sample_weights = (
                dlrm.loss_ws[T.data.view(-1).long()].view_as(T).to(device)
            )
            raw_loss = dlrm.loss_fn(Z, T.to(device))
            return (sample_weights * raw_loss).mean()
# The following function is a wrapper to avoid checking this multiple times in the
# loop below.
def unpack_batch(b):
    """Normalize a batch to (X, lS_o, lS_i, T, W, CBPP) regardless of source."""
    if args.data_generation == "internal":
        return fbInputBatchFormatter(b, args.data_size)
    # Experiment with unweighted samples
    X, lS_o, lS_i, T = b[0], b[1], b[2], b[3]
    return X, lS_o, lS_i, T, torch.ones(T.size()), None
class LRPolicyScheduler(_LRScheduler):
    """Linear-warmup / quadratic-decay learning-rate policy.

    The lr ramps linearly from 0 over `num_warmup_steps`, holds between
    warmup and `decay_start_step`, then decays quadratically to a tiny
    floor over `num_decay_steps`; afterwards it stays frozen at the last
    decayed value.
    """

    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps
        # the two phases must not overlap
        if self.decay_start_step < self.num_warmup_steps:
            sys.exit("Learning rate warmup must finish before the decay starts")
        super(LRPolicyScheduler, self).__init__(optimizer)

    def get_lr(self):
        step = self._step_count
        if step < self.num_warmup_steps:
            # linear warmup from 0 toward the base lr
            frac = 1.0 - (self.num_warmup_steps - step) / self.num_warmup_steps
            lr = [base_lr * frac for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step < self.decay_end_step:
            # quadratic decay toward a small floor
            decayed_steps = step - self.decay_start_step
            frac = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * frac) for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.num_decay_steps > 0:
            # freeze at last, either because we're after decay
            # or because we're between warmup and decay
            lr = self.last_lr
        else:
            # no decay configured: do not adjust
            lr = self.base_lrs
        return lr
### define dlrm in PyTorch ###
class DLRM_Net(nn.Module):
def create_mlp(self, ln, sigmoid_layer):
# build MLP layer by layer
layers = nn.ModuleList()
for i in range(0, ln.size - 1):
n = ln[i]
m = ln[i + 1]
# construct fully connected operator
LL = nn.Linear(int(n), int(m), bias=True)
# initialize the weights
# with torch.no_grad():
# custom Xavier input, output or two-sided fill
mean = 0.0 # std_dev = np.sqrt(variance)
std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n)
W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1))
bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
# approach 1
LL.weight.data = torch.tensor(W, requires_grad=True)
LL.bias.data = torch.tensor(bt, requires_grad=True)
# approach 2
# LL.weight.data.copy_(torch.tensor(W))
# LL.bias.data.copy_(torch.tensor(bt))
# approach 3
# LL.weight = Parameter(torch.tensor(W),requires_grad=True)
# LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
layers.append(LL)
# construct sigmoid or relu operator
if i == sigmoid_layer:
layers.append(nn.Sigmoid())
else:
layers.append(nn.ReLU())
# approach 1: use ModuleList
# return layers
# approach 2: use Sequential container to wrap all layers
return torch.nn.Sequential(*layers)
    def create_emb(self, m, ln, weighted_pooling=None):
        """Create one embedding table per sparse feature.

        m: embedding dimension (a list of per-table dims when md_flag is
        set — TODO confirm against caller); ln: vector of table sizes.
        Returns (emb_l, v_W_l): the tables and per-table pooling weights
        (None entries when weighted_pooling is None).
        """
        emb_l = nn.ModuleList()
        v_W_l = []
        for i in range(0, ln.size):
            # distributed run: only build the tables assigned to this rank
            if ext_dist.my_size > 1:
                if i not in self.local_emb_indices:
                    continue
            n = ln[i]
            # construct embedding operator
            if self.qr_flag and n > self.qr_threshold:
                # quotient-remainder trick for very large tables
                EE = QREmbeddingBag(
                    n,
                    m,
                    self.qr_collisions,
                    operation=self.qr_operation,
                    mode="sum",
                    sparse=True,
                )
            elif self.md_flag and n > self.md_threshold:
                # mixed-dimension trick: per-table dim m[i], projected to base
                base = max(m)
                _m = m[i] if n > self.md_threshold else base
                EE = PrEmbeddingBag(n, _m, base)
                # use np initialization as below for consistency...
                W = np.random.uniform(
                    low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m)
                ).astype(np.float32)
                EE.embs.weight.data = torch.tensor(W, requires_grad=True)
            else:
                EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True)
                # initialize embeddings
                # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
                W = np.random.uniform(
                    low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
                ).astype(np.float32)
                # approach 1
                EE.weight.data = torch.tensor(W, requires_grad=True)
                # approach 2
                # EE.weight.data.copy_(torch.tensor(W))
                # approach 3
                # EE.weight = Parameter(torch.tensor(W),requires_grad=True)
            # pooling weights: one per row, ones-initialized when enabled
            if weighted_pooling is None:
                v_W_l.append(None)
            else:
                v_W_l.append(torch.ones(n, dtype=torch.float32))
            emb_l.append(EE)
        return emb_l, v_W_l
    def __init__(
        self,
        m_spa=None,
        ln_emb=None,
        ln_bot=None,
        ln_top=None,
        arch_interaction_op=None,
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=-1,
        sync_dense_params=True,
        loss_threshold=0.0,
        ndevices=-1,
        qr_flag=False,
        qr_operation="mult",
        qr_collisions=0,
        qr_threshold=200,
        md_flag=False,
        md_threshold=200,
        weighted_pooling=None,
        loss_function="bce",
    ):
        """Build the DLRM: embedding tables + bottom/top MLPs + loss.

        m_spa: sparse-feature embedding dim; ln_emb: table sizes;
        ln_bot/ln_top: MLP layer sizes; arch_interaction_op: "dot"/"cat";
        ndevices <= 1 builds all operators here, otherwise embeddings are
        created later by the parallel path.  If any of the five core arch
        arguments is None the model is left unconstructed.
        """
        super(DLRM_Net, self).__init__()
        if (
            (m_spa is not None)
            and (ln_emb is not None)
            and (ln_bot is not None)
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
            # save arguments
            self.ndevices = ndevices
            self.output_d = 0
            self.parallel_model_batch_size = -1
            self.parallel_model_is_not_prepared = True
            self.arch_interaction_op = arch_interaction_op
            self.arch_interaction_itself = arch_interaction_itself
            self.sync_dense_params = sync_dense_params
            self.loss_threshold = loss_threshold
            self.loss_function = loss_function
            # anything other than None/"fixed" means learnable pooling weights
            if weighted_pooling is not None and weighted_pooling != "fixed":
                self.weighted_pooling = "learned"
            else:
                self.weighted_pooling = weighted_pooling
            # create variables for QR embedding if applicable
            self.qr_flag = qr_flag
            if self.qr_flag:
                self.qr_collisions = qr_collisions
                self.qr_operation = qr_operation
                self.qr_threshold = qr_threshold
            # create variables for MD embedding if applicable
            self.md_flag = md_flag
            if self.md_flag:
                self.md_threshold = md_threshold
            # If running distributed, get local slice of embedding tables
            if ext_dist.my_size > 1:
                n_emb = len(ln_emb)
                # need at least one table per rank to partition
                if n_emb < ext_dist.my_size:
                    sys.exit(
                        "only (%d) sparse features for (%d) devices, table partitions will fail"
                        % (n_emb, ext_dist.my_size)
                    )
                self.n_global_emb = n_emb
                self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(
                    n_emb
                )
                self.local_emb_slice = ext_dist.get_my_slice(n_emb)
                self.local_emb_indices = list(range(n_emb))[self.local_emb_slice]
            # create operators
            if ndevices <= 1:
                self.emb_l, w_list = self.create_emb(m_spa, ln_emb, weighted_pooling)
                if self.weighted_pooling == "learned":
                    # register pooling weights as trainable parameters
                    self.v_W_l = nn.ParameterList()
                    for w in w_list:
                        self.v_W_l.append(Parameter(w))
                else:
                    self.v_W_l = w_list
            self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
            self.top_l = self.create_mlp(ln_top, sigmoid_top)
            # quantization
            # disabled by default; see quantize_embedding()
            self.quantize_emb = False
            self.emb_l_q = []
            self.quantize_bits = 32
            # specify the loss function
            if self.loss_function == "mse":
                self.loss_fn = torch.nn.MSELoss(reduction="mean")
            elif self.loss_function == "bce":
                self.loss_fn = torch.nn.BCELoss(reduction="mean")
            elif self.loss_function == "wbce":
                # per-class weights parsed from the global CLI args
                self.loss_ws = torch.tensor(
                    np.fromstring(args.loss_weights, dtype=float, sep="-")
                )
                self.loss_fn = torch.nn.BCELoss(reduction="none")
            else:
                sys.exit(
                    "ERROR: --loss-function=" + self.loss_function + " is not supported"
                )
def apply_mlp(self, x, layers):
# approach 1: use ModuleList
# for layer in layers:
# x = layer(x)
# return x
# approach 2: use Sequential container to wrap all layers
return layers(x)
def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
# WARNING: notice that we are processing the batch at once. We implicitly
# assume that the data is laid out such that:
# 1. each embedding is indexed with a group of sparse indices,
# corresponding to a single lookup
# 2. for each embedding the lookups are further organized into a batch
# 3. for a list of embedding tables there is a list of batched lookups
ly = []
for k, sparse_index_group_batch in enumerate(lS_i):
sparse_offset_group_batch = lS_o[k]
# embedding lookup
# We are using EmbeddingBag, which implicitly uses sum operator.
# The embeddings are represented as tall matrices, with sum
# happening vertically across 0 axis, resulting in a row vector
# E = emb_l[k]
if v_W_l[k] is not None:
per_sample_weights = v_W_l[k].gather(0, sparse_index_group_batch)
else:
per_sample_weights = None
if self.quantize_emb:
s1 = self.emb_l_q[k].element_size() * self.emb_l_q[k].nelement()
s2 = self.emb_l_q[k].element_size() * self.emb_l_q[k].nelement()
print("quantized emb sizes:", s1, s2)
if self.quantize_bits == 4:
QV = ops.quantized.embedding_bag_4bit_rowwise_offsets(
self.emb_l_q[k],
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
elif self.quantize_bits == 8:
QV = ops.quantized.embedding_bag_byte_rowwise_offsets(
self.emb_l_q[k],
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(QV)
else:
E = emb_l[k]
V = E(
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(V)
# print(ly)
return ly
# using quantizing functions from caffe2/aten/src/ATen/native/quantized/cpu
def quantize_embedding(self, bits):
n = len(self.emb_l)
self.emb_l_q = [None] * n
for k in range(n):
if bits == 4:
self.emb_l_q[k] = ops.quantized.embedding_bag_4bit_prepack(
self.emb_l[k].weight
)
elif bits == 8:
self.emb_l_q[k] = ops.quantized.embedding_bag_byte_prepack(
self.emb_l[k].weight
)
else:
return
self.emb_l = None
self.quantize_emb = True
self.quantize_bits = bits
def interact_features(self, x, ly):
if self.arch_interaction_op == "dot":
# concatenate dense and sparse features
(batch_size, d) = x.shape
T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
# perform a dot product
Z = torch.bmm(T, torch.transpose(T, 1, 2))
# append dense feature with the interactions (into a row vector)
# approach 1: all
# Zflat = Z.view((batch_size, -1))
# approach 2: unique
_, ni, nj = Z.shape
# approach 1: tril_indices
# offset = 0 if self.arch_interaction_itself else -1
# li, lj = torch.tril_indices(ni, nj, offset=offset)
# approach 2: custom
offset = 1 if self.arch_interaction_itself else 0
li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])
Zflat = Z[:, li, lj]
# concatenate dense features and interactions
R = torch.cat([x] + [Zflat], dim=1)
elif self.arch_interaction_op == "cat":
# concatenation features (into a row vector)
R = torch.cat([x] + ly, dim=1)
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ self.arch_interaction_op
+ " is not supported"
)
return R
def forward(self, dense_x, lS_o, lS_i):
if ext_dist.my_size > 1:
# multi-node multi-device run
return self.distributed_forward(dense_x, lS_o, lS_i)
elif self.ndevices <= 1:
# single device run
return self.sequential_forward(dense_x, lS_o, lS_i)
else:
# single-node multi-device run
return self.parallel_forward(dense_x, lS_o, lS_i)
    def distributed_forward(self, dense_x, lS_o, lS_i):
        """Forward pass for the multi-node (hybrid parallel) configuration.

        Dense features are sliced across ranks (data parallelism) while each
        rank holds only its local embedding tables (model parallelism); an
        all-to-all exchange then redistributes the lookup results so every
        rank ends up with all features for its own slice of the batch.
        """
        batch_size = dense_x.size()[0]
        # WARNING: # of ranks must be <= batch size in distributed_forward call
        if batch_size < ext_dist.my_size:
            sys.exit(
                "ERROR: batch_size (%d) must be larger than number of ranks (%d)"
                % (batch_size, ext_dist.my_size)
            )
        if batch_size % ext_dist.my_size != 0:
            sys.exit(
                "ERROR: batch_size %d can not split across %d ranks evenly"
                % (batch_size, ext_dist.my_size)
            )
        # keep only this rank's slice of the batch and its local tables' inputs
        dense_x = dense_x[ext_dist.get_my_slice(batch_size)]
        lS_o = lS_o[self.local_emb_slice]
        lS_i = lS_i[self.local_emb_slice]
        # sanity check: one offsets/indices group per local embedding table
        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
            sys.exit(
                "ERROR: corrupted model input detected in distributed_forward call"
            )
        # embeddings
        with record_function("DLRM embedding forward"):
            ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
        # WARNING: Note that at this point we have the result of the embedding lookup
        # for the entire batch on each rank. We would like to obtain partial results
        # corresponding to all embedding lookups, but part of the batch on each rank.
        # Therefore, matching the distribution of output of bottom mlp, so that both
        # could be used for subsequent interactions on each device.
        if len(self.emb_l) != len(ly):
            sys.exit("ERROR: corrupted intermediate result in distributed_forward call")
        # kick off the (async) all-to-all and overlap it with the bottom mlp
        a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank)
        with record_function("DLRM bottom nlp forward"):
            x = self.apply_mlp(dense_x, self.bot_l)
        # block until the exchanged embedding outputs are available
        ly = a2a_req.wait()
        ly = list(ly)
        # interactions
        with record_function("DLRM interaction forward"):
            z = self.interact_features(x, ly)
        # top mlp
        with record_function("DLRM top nlp forward"):
            p = self.apply_mlp(z, self.top_l)
        # clamp output if needed
        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
            z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
        else:
            z = p
        return z
def sequential_forward(self, dense_x, lS_o, lS_i):
# process dense features (using bottom mlp), resulting in a row vector
x = self.apply_mlp(dense_x, self.bot_l)
# debug prints
# print("intermediate")
# print(x.detach().cpu().numpy())
# process sparse features(using embeddings), resulting in a list of row vectors
ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
# for y in ly:
# print(y.detach().cpu().numpy())
# interact features (dense and sparse)
z = self.interact_features(x, ly)
# print(z.detach().cpu().numpy())
# obtain probability of a click (using top mlp)
p = self.apply_mlp(z, self.top_l)
# clamp output if needed
if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
else:
z = p
return z
    def parallel_forward(self, dense_x, lS_o, lS_i):
        """Forward pass for a single node with multiple GPUs.

        MLPs are replicated on every device (data parallelism) while the
        embedding tables are assigned to devices round-robin (model
        parallelism); a butterfly shuffle then re-aligns the embedding
        outputs with the scattered mini-batch before the interaction step.
        Mutates self: replaces emb_l/v_W_l with device-resident copies and
        caches the MLP replicas until the batch size changes.
        """
        ### prepare model (overwrite) ###
        # WARNING: # of devices must be >= batch size in parallel_forward call
        batch_size = dense_x.size()[0]
        ndevices = min(self.ndevices, batch_size, len(self.emb_l))
        device_ids = range(ndevices)
        # WARNING: must redistribute the model if mini-batch size changes(this is common
        # for last mini-batch, when # of elements in the dataset/batch size is not even
        if self.parallel_model_batch_size != batch_size:
            self.parallel_model_is_not_prepared = True
        if self.parallel_model_is_not_prepared or self.sync_dense_params:
            # replicate mlp (data parallelism)
            self.bot_l_replicas = replicate(self.bot_l, device_ids)
            self.top_l_replicas = replicate(self.top_l, device_ids)
            self.parallel_model_batch_size = batch_size
        if self.parallel_model_is_not_prepared:
            # distribute embeddings (model parallelism)
            t_list = []
            w_list = []
            for k, emb in enumerate(self.emb_l):
                # round-robin: table k lives on device k % ndevices
                d = torch.device("cuda:" + str(k % ndevices))
                t_list.append(emb.to(d))
                if self.weighted_pooling == "learned":
                    w_list.append(Parameter(self.v_W_l[k].to(d)))
                elif self.weighted_pooling == "fixed":
                    w_list.append(self.v_W_l[k].to(d))
                else:
                    w_list.append(None)
            self.emb_l = nn.ModuleList(t_list)
            if self.weighted_pooling == "learned":
                self.v_W_l = nn.ParameterList(w_list)
            else:
                self.v_W_l = w_list
            self.parallel_model_is_not_prepared = False
        ### prepare input (overwrite) ###
        # scatter dense features (data parallelism)
        # print(dense_x.device)
        dense_x = scatter(dense_x, device_ids, dim=0)
        # distribute sparse features (model parallelism)
        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
            sys.exit("ERROR: corrupted model input detected in parallel_forward call")
        t_list = []
        i_list = []
        for k, _ in enumerate(self.emb_l):
            # move each table's offsets/indices to that table's device
            d = torch.device("cuda:" + str(k % ndevices))
            t_list.append(lS_o[k].to(d))
            i_list.append(lS_i[k].to(d))
        lS_o = t_list
        lS_i = i_list
        ### compute results in parallel ###
        # bottom mlp
        # WARNING: Note that the self.bot_l is a list of bottom mlp modules
        # that have been replicated across devices, while dense_x is a tuple of dense
        # inputs that has been scattered across devices on the first (batch) dimension.
        # The output is a list of tensors scattered across devices according to the
        # distribution of dense_x.
        x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids)
        # debug prints
        # print(x)
        # embeddings
        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
        # debug prints
        # print(ly)
        # butterfly shuffle (implemented inefficiently for now)
        # WARNING: Note that at this point we have the result of the embedding lookup
        # for the entire batch on each device. We would like to obtain partial results
        # corresponding to all embedding lookups, but part of the batch on each device.
        # Therefore, matching the distribution of output of bottom mlp, so that both
        # could be used for subsequent interactions on each device.
        if len(self.emb_l) != len(ly):
            sys.exit("ERROR: corrupted intermediate result in parallel_forward call")
        t_list = []
        for k, _ in enumerate(self.emb_l):
            d = torch.device("cuda:" + str(k % ndevices))
            # split table k's (full-batch) output across devices on the batch dim
            y = scatter(ly[k], device_ids, dim=0)
            t_list.append(y)
        # adjust the list to be ordered per device
        ly = list(map(lambda y: list(y), zip(*t_list)))
        # debug prints
        # print(ly)
        # interactions
        z = []
        for k in range(ndevices):
            zk = self.interact_features(x[k], ly[k])
            z.append(zk)
        # debug prints
        # print(z)
        # top mlp
        # WARNING: Note that the self.top_l is a list of top mlp modules that
        # have been replicated across devices, while z is a list of interaction results
        # that by construction are scattered across devices on the first (batch) dim.
        # The output is a list of tensors scattered across devices according to the
        # distribution of z.
        p = parallel_apply(self.top_l_replicas, z, None, device_ids)
        ### gather the distributed results ###
        p0 = gather(p, self.output_d, dim=0)
        # clamp output if needed
        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
            z0 = torch.clamp(
                p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
            )
        else:
            z0 = p0
        return z0
def dash_separated_ints(value):
    """argparse type checker: every dash-separated piece must parse as an int."""

    def _is_int(token):
        try:
            int(token)
        except ValueError:
            return False
        return True

    if not all(_is_int(token) for token in value.split("-")):
        raise argparse.ArgumentTypeError(
            "%s is not a valid dash separated list of ints" % value
        )
    return value
def dash_separated_floats(value):
    """argparse type checker: every dash-separated piece must parse as a float."""

    def _is_float(token):
        try:
            float(token)
        except ValueError:
            return False
        return True

    if not all(_is_float(token) for token in value.split("-")):
        raise argparse.ArgumentTypeError(
            "%s is not a valid dash separated list of floats" % value
        )
    return value
def inference(
    args,
    dlrm,
    best_acc_test,
    best_auc_test,
    test_ld,
    device,
    use_gpu,
    log_iter=-1,
):
    """Evaluate ``dlrm`` over the test loader and report accuracy metrics.

    Relies on module globals set up in run(): ``nbatches`` / ``nbatches_test``
    (batch caps), ``ndevices`` and ``writer`` (TensorBoard SummaryWriter).
    With --mlperf-logging, scores are accumulated and scored with sklearn
    metrics at the end; otherwise a running round-to-nearest accuracy is kept.

    Returns:
        (model_metrics_dict, is_best): a checkpoint-ready dict (state_dict
        plus metrics) and whether this evaluation beat the previous best
        accuracy (or AUC under mlperf logging). The best_* parameters are
        only rebound locally; callers must track bests via the return value.
    """
    test_accu = 0
    test_samp = 0
    if args.mlperf_logging:
        scores = []
        targets = []
    for i, testBatch in enumerate(test_ld):
        # early exit if nbatches was set by the user and was exceeded
        if nbatches > 0 and i >= nbatches:
            break
        X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch(
            testBatch
        )
        # Skip the batch if batch size not multiple of total ranks
        if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0:
            print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0)))
            continue
        # forward pass
        Z_test = dlrm_wrap(
            X_test,
            lS_o_test,
            lS_i_test,
            use_gpu,
            device,
            ndevices=ndevices,
        )
        ### gather the distributed results on each rank ###
        # For some reason it requires explicit sync before all_gather call if
        # tensor is on GPU memory
        if Z_test.is_cuda:
            torch.cuda.synchronize()
        (_, batch_split_lengths) = ext_dist.get_split_lengths(X_test.size(0))
        if ext_dist.my_size > 1:
            Z_test = ext_dist.all_gather(Z_test, batch_split_lengths)
        if args.mlperf_logging:
            # defer metric computation: just collect scores and targets
            S_test = Z_test.detach().cpu().numpy()  # numpy array
            T_test = T_test.detach().cpu().numpy()  # numpy array
            scores.append(S_test)
            targets.append(T_test)
        else:
            with record_function("DLRM accuracy compute"):
                # compute loss and accuracy
                S_test = Z_test.detach().cpu().numpy()  # numpy array
                T_test = T_test.detach().cpu().numpy()  # numpy array
                mbs_test = T_test.shape[0]  # = mini_batch_size except last
                # count correct predictions (scores rounded to 0/1)
                A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8))
                test_accu += A_test
                test_samp += mbs_test
    if args.mlperf_logging:
        with record_function("DLRM mlperf sklearn metrics compute"):
            scores = np.concatenate(scores, axis=0)
            targets = np.concatenate(targets, axis=0)
            # threshold-based metrics wrap sklearn with rounded predictions;
            # ap and roc_auc consume the raw scores directly
            metrics = {
                "recall": lambda y_true, y_score: sklearn.metrics.recall_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "precision": lambda y_true, y_score: sklearn.metrics.precision_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "f1": lambda y_true, y_score: sklearn.metrics.f1_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "ap": sklearn.metrics.average_precision_score,
                "roc_auc": sklearn.metrics.roc_auc_score,
                "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
            }
            validation_results = {}
            for metric_name, metric_function in metrics.items():
                validation_results[metric_name] = metric_function(targets, scores)
                writer.add_scalar(
                    "mlperf-metrics-test/" + metric_name,
                    validation_results[metric_name],
                    log_iter,
                )
            acc_test = validation_results["accuracy"]
    else:
        acc_test = test_accu / test_samp
        writer.add_scalar("Test/Acc", acc_test, log_iter)
    model_metrics_dict = {
        "nepochs": args.nepochs,
        "nbatches": nbatches,
        "nbatches_test": nbatches_test,
        "state_dict": dlrm.state_dict(),
        "test_acc": acc_test,
    }
    if args.mlperf_logging:
        # under mlperf logging, "best" is decided by AUC rather than accuracy
        is_best = validation_results["roc_auc"] > best_auc_test
        if is_best:
            best_auc_test = validation_results["roc_auc"]
            model_metrics_dict["test_auc"] = best_auc_test
        print(
            "recall {:.4f}, precision {:.4f},".format(
                validation_results["recall"],
                validation_results["precision"],
            )
            + " f1 {:.4f}, ap {:.4f},".format(
                validation_results["f1"], validation_results["ap"]
            )
            + " auc {:.4f}, best auc {:.4f},".format(
                validation_results["roc_auc"], best_auc_test
            )
            + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
                validation_results["accuracy"] * 100, best_acc_test * 100
            ),
            flush=True,
        )
    else:
        is_best = acc_test > best_acc_test
        if is_best:
            best_acc_test = acc_test
        print(
            " accuracy {:3.3f} %, best {:3.3f} %".format(
                acc_test * 100, best_acc_test * 100
            ),
            flush=True,
        )
    return model_metrics_dict, is_best
def run():
### parse arguments ###
parser = argparse.ArgumentParser(
description="Train Deep Learning Recommendation Model (DLRM)"
)
# model related parameters
parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
parser.add_argument(
"--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
)
# j will be replaced with the table number
parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2")
parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1")
parser.add_argument(
"--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
)
parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
parser.add_argument("--weighted-pooling", type=str, default=None)
# embedding table options
parser.add_argument("--md-flag", action="store_true", default=False)
parser.add_argument("--md-threshold", type=int, default=200)
parser.add_argument("--md-temperature", type=float, default=0.3)
parser.add_argument("--md-round-dims", action="store_true", default=False)
parser.add_argument("--qr-flag", action="store_true", default=False)
parser.add_argument("--qr-threshold", type=int, default=200)
parser.add_argument("--qr-operation", type=str, default="mult")
parser.add_argument("--qr-collisions", type=int, default=4)
# activations and loss
parser.add_argument("--activation-function", type=str, default="relu")
parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce
parser.add_argument(
"--loss-weights", type=dash_separated_floats, default="1.0-1.0"
) # for wbce
parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7
parser.add_argument("--round-targets", type=bool, default=False)
# data
parser.add_argument("--data-size", type=int, default=1)
parser.add_argument("--num-batches", type=int, default=0)
parser.add_argument(
"--data-generation",
type=str,
choices=["random", "dataset", "internal"],
default="random",
) # synthetic, dataset or internal
parser.add_argument(
"--rand-data-dist", type=str, default="uniform"
) # uniform or gaussian
parser.add_argument("--rand-data-min", type=float, default=0)
parser.add_argument("--rand-data-max", type=float, default=1)
parser.add_argument("--rand-data-mu", type=float, default=-1)
parser.add_argument("--rand-data-sigma", type=float, default=1)
parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--num-indices-per-lookup", type=int, default=10)
parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
parser.add_argument("--num-workers", type=int, default=0)
parser.add_argument("--memory-map", action="store_true", default=False)
# training
parser.add_argument("--mini-batch-size", type=int, default=1)
parser.add_argument("--nepochs", type=int, default=1)
parser.add_argument("--learning-rate", type=float, default=0.01)
parser.add_argument("--print-precision", type=int, default=5)
parser.add_argument("--numpy-rand-seed", type=int, default=123)
parser.add_argument("--sync-dense-params", type=bool, default=True)
parser.add_argument("--optimizer", type=str, default="sgd")
parser.add_argument(
"--dataset-multiprocessing",
action="store_true",
default=False,
help="The Kaggle dataset can be multiprocessed in an environment \
with more than 7 CPU cores and more than 20 GB of memory. \n \
The Terabyte dataset can be multiprocessed in an environment \
with more than 24 CPU cores and at least 1 TB of memory.",
)
# inference
parser.add_argument("--inference-only", action="store_true", default=False)
# quantize
parser.add_argument("--quantize-mlp-with-bit", type=int, default=32)
parser.add_argument("--quantize-emb-with-bit", type=int, default=32)
# onnx
parser.add_argument("--save-onnx", action="store_true", default=False)
# gpu
parser.add_argument("--use-gpu", action="store_true", default=False)
# distributed
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--dist-backend", type=str, default="")
# debugging and profiling
parser.add_argument("--print-freq", type=int, default=1)
parser.add_argument("--test-freq", type=int, default=-1)
parser.add_argument("--test-mini-batch-size", type=int, default=-1)
parser.add_argument("--test-num-workers", type=int, default=-1)
parser.add_argument("--print-time", action="store_true", default=False)
parser.add_argument("--print-wall-time", action="store_true", default=False)
parser.add_argument("--debug-mode", action="store_true", default=False)
parser.add_argument("--enable-profiling", action="store_true", default=False)
parser.add_argument("--plot-compute-graph", action="store_true", default=False)
parser.add_argument("--tensor-board-filename", type=str, default="run_kaggle_pt")
# store/load model
parser.add_argument("--save-model", type=str, default="")
parser.add_argument("--load-model", type=str, default="")
# mlperf logging (disables other output and stops early)
parser.add_argument("--mlperf-logging", action="store_true", default=False)
# stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
# stop at target AUC Terabyte (no subsampling) 0.8025
parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
parser.add_argument("--mlperf-bin-loader", action="store_true", default=False)
parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False)
# mlperf gradient accumulation iterations
parser.add_argument("--mlperf-grad-accum-iter", type=int, default=1)
# LR policy
parser.add_argument("--lr-num-warmup-steps", type=int, default=0)
parser.add_argument("--lr-decay-start-step", type=int, default=0)
parser.add_argument("--lr-num-decay-steps", type=int, default=0)
global args
global nbatches
global nbatches_test
global writer
args = parser.parse_args()
if args.dataset_multiprocessing:
assert sys.version_info[0] >= 3 and sys.version_info[1] > 7, (
"The dataset_multiprocessing "
+ "flag is susceptible to a bug in Python 3.7 and under. "
+ "https://github.com/facebookresearch/dlrm/issues/172"
)
if args.mlperf_logging:
mlperf_logger.log_event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
mlperf_logger.log_start(
key=mlperf_logger.constants.INIT_START, log_all_ranks=True
)
if args.weighted_pooling is not None:
if args.qr_flag:
sys.exit("ERROR: quotient remainder with weighted pooling is not supported")
if args.md_flag:
sys.exit("ERROR: mixed dimensions with weighted pooling is not supported")
if args.quantize_emb_with_bit in [4, 8]:
if args.qr_flag:
sys.exit(
"ERROR: 4 and 8-bit quantization with quotient remainder is not supported"
)
if args.md_flag:
sys.exit(
"ERROR: 4 and 8-bit quantization with mixed dimensions is not supported"
)
if args.use_gpu:
sys.exit("ERROR: 4 and 8-bit quantization on GPU is not supported")
### some basic setup ###
np.random.seed(args.numpy_rand_seed)
np.set_printoptions(precision=args.print_precision)
torch.set_printoptions(precision=args.print_precision)
torch.manual_seed(args.numpy_rand_seed)
if args.test_mini_batch_size < 0:
# if the parameter is not set, use the training batch size
args.test_mini_batch_size = args.mini_batch_size
if args.test_num_workers < 0:
# if the parameter is not set, use the same parameter for training
args.test_num_workers = args.num_workers
use_gpu = args.use_gpu and torch.cuda.is_available()
if not args.debug_mode:
ext_dist.init_distributed(
local_rank=args.local_rank, use_gpu=use_gpu, backend=args.dist_backend
)
if use_gpu:
torch.cuda.manual_seed_all(args.numpy_rand_seed)
torch.backends.cudnn.deterministic = True
if ext_dist.my_size > 1:
ngpus = 1
device = torch.device("cuda", ext_dist.my_local_rank)
else:
ngpus = torch.cuda.device_count()
device = torch.device("cuda", 0)
print("Using {} GPU(s)...".format(ngpus))
else:
device = torch.device("cpu")
print("Using CPU...")
### prepare training data ###
ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
# input data
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP)
mlperf_logger.barrier()
mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START)
mlperf_logger.barrier()
if args.data_generation == "dataset":
train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
table_feature_map = {idx: idx for idx in range(len(train_data.counts))}
nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
nbatches_test = len(test_ld)
ln_emb = train_data.counts
# enforce maximum limit on number of vectors per embedding
if args.max_ind_range > 0:
ln_emb = np.array(
list(
map(
lambda x: x if x < args.max_ind_range else args.max_ind_range,
ln_emb,
)
)
)
else:
ln_emb = np.array(ln_emb)
m_den = train_data.m_den
ln_bot[0] = m_den
elif args.data_generation == "internal":
if not has_internal_libs:
raise Exception("Internal libraries are not available.")
NUM_BATCHES = 5000
nbatches = args.num_batches if args.num_batches > 0 else NUM_BATCHES
train_ld, feature_to_num_embeddings = fbDataLoader(args.data_size, nbatches)
ln_emb = np.array(list(feature_to_num_embeddings.values()))
m_den = ln_bot[0]
else:
# input and target at random
ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
m_den = ln_bot[0]
train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(
args, ln_emb, m_den
)
nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
nbatches_test = len(test_ld)
args.ln_emb = ln_emb.tolist()
if args.mlperf_logging:
print("command line args: ", json.dumps(vars(args)))
### parse command line arguments ###
m_spa = args.arch_sparse_feature_size
ln_emb = np.asarray(ln_emb)
num_fea = ln_emb.size + 1 # num sparse + num dense features
m_den_out = ln_bot[ln_bot.size - 1]
if args.arch_interaction_op == "dot":
# approach 1: all
# num_int = num_fea * num_fea + m_den_out
# approach 2: unique
if args.arch_interaction_itself:
num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
else:
num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
elif args.arch_interaction_op == "cat":
num_int = num_fea * m_den_out
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ args.arch_interaction_op
+ " is not supported"
)
arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
# sanity check: feature sizes and mlp dimensions must match
if m_den != ln_bot[0]:
sys.exit(
"ERROR: arch-dense-feature-size "
+ str(m_den)
+ " does not match first dim of bottom mlp "
+ str(ln_bot[0])
)
if args.qr_flag:
if args.qr_operation == "concat" and 2 * m_spa != m_den_out:
sys.exit(
"ERROR: 2 arch-sparse-feature-size "
+ str(2 * m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
+ " (note that the last dim of bottom mlp must be 2x the embedding dim)"
)
if args.qr_operation != "concat" and m_spa != m_den_out:
sys.exit(
"ERROR: arch-sparse-feature-size "
+ str(m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
)
else:
if m_spa != m_den_out:
sys.exit(
"ERROR: arch-sparse-feature-size "
+ str(m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
)
if num_int != ln_top[0]:
sys.exit(
"ERROR: # of feature interactions "
+ str(num_int)
+ " does not match first dimension of top mlp "
+ str(ln_top[0])
)
# assign mixed dimensions if applicable
if args.md_flag:
m_spa = md_solver(
torch.tensor(ln_emb),
args.md_temperature, # alpha
d0=m_spa,
round_dim=args.md_round_dims,
).tolist()
# test prints (model arch)
if args.debug_mode:
print("model arch:")
print(
"mlp top arch "
+ str(ln_top.size - 1)
+ " layers, with input to output dimensions:"
)
print(ln_top)
print("# of interactions")
print(num_int)
print(
"mlp bot arch "
+ str(ln_bot.size - 1)
+ " layers, with input to output dimensions:"
)
print(ln_bot)
print("# of features (sparse and dense)")
print(num_fea)
print("dense feature size")
print(m_den)
print("sparse feature size")
print(m_spa)
print(
"# of embeddings (= # of sparse features) "
+ str(ln_emb.size)
+ ", with dimensions "
+ str(m_spa)
+ "x:"
)
print(ln_emb)
print("data (inputs and targets):")
for j, inputBatch in enumerate(train_ld):
X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch)
torch.set_printoptions(precision=4)
# early exit if nbatches was set by the user and has been exceeded
if nbatches > 0 and j >= nbatches:
break
print("mini-batch: %d" % j)
print(X.detach().cpu())
# transform offsets to lengths when printing
print(
torch.IntTensor(
[
np.diff(
S_o.detach().cpu().tolist() + list(lS_i[i].shape)
).tolist()
for i, S_o in enumerate(lS_o)
]
)
)
print([S_i.detach().cpu() for S_i in lS_i])
print(T.detach().cpu())
global ndevices
ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
### construct the neural network specified above ###
# WARNING: to obtain exactly the same initialization for
# the weights we need to start from the same random seed.
# np.random.seed(args.numpy_rand_seed)
global dlrm
dlrm = DLRM_Net(
m_spa,
ln_emb,
ln_bot,
ln_top,
arch_interaction_op=args.arch_interaction_op,
arch_interaction_itself=args.arch_interaction_itself,
sigmoid_bot=-1,
sigmoid_top=ln_top.size - 2,
sync_dense_params=args.sync_dense_params,
loss_threshold=args.loss_threshold,
ndevices=ndevices,
qr_flag=args.qr_flag,
qr_operation=args.qr_operation,
qr_collisions=args.qr_collisions,
qr_threshold=args.qr_threshold,
md_flag=args.md_flag,
md_threshold=args.md_threshold,
weighted_pooling=args.weighted_pooling,
loss_function=args.loss_function,
)
# test prints
if args.debug_mode:
print("initial parameters (weights and bias):")
for param in dlrm.parameters():
print(param.detach().cpu().numpy())
# print(dlrm)
if use_gpu:
# Custom Model-Data Parallel
# the mlps are replicated and use data parallelism, while
# the embeddings are distributed and use model parallelism
dlrm = dlrm.to(device) # .cuda()
if dlrm.ndevices > 1:
dlrm.emb_l, dlrm.v_W_l = dlrm.create_emb(
m_spa, ln_emb, args.weighted_pooling
)
else:
if dlrm.weighted_pooling == "fixed":
for k, w in enumerate(dlrm.v_W_l):
dlrm.v_W_l[k] = w.cuda()
# distribute data parallel mlps
if ext_dist.my_size > 1:
if use_gpu:
device_ids = [ext_dist.my_local_rank]
dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids)
dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids)
else:
dlrm.bot_l = ext_dist.DDP(dlrm.bot_l)
dlrm.top_l = ext_dist.DDP(dlrm.top_l)
if not args.inference_only:
if use_gpu and args.optimizer in ["rwsadagrad", "adagrad"]:
sys.exit("GPU version of Adagrad is not supported by PyTorch.")
# specify the optimizer algorithm
opts = {
"sgd": torch.optim.SGD,
"rwsadagrad": RowWiseSparseAdagrad.RWSAdagrad,
"adagrad": torch.optim.Adagrad,
}
parameters = (
dlrm.parameters()
if ext_dist.my_size == 1
else [
{
"params": [p for emb in dlrm.emb_l for p in emb.parameters()],
"lr": args.learning_rate,
},
# TODO check this lr setup
# bottom mlp has no data parallelism
# need to check how do we deal with top mlp
{
"params": dlrm.bot_l.parameters(),
"lr": args.learning_rate,
},
{
"params": dlrm.top_l.parameters(),
"lr": args.learning_rate,
},
]
)
optimizer = opts[args.optimizer](parameters, lr=args.learning_rate)
lr_scheduler = LRPolicyScheduler(
optimizer,
args.lr_num_warmup_steps,
args.lr_decay_start_step,
args.lr_num_decay_steps,
)
### main loop ###
# training or inference
best_acc_test = 0
best_auc_test = 0
skip_upto_epoch = 0
skip_upto_batch = 0
total_time = 0
total_loss = 0
total_iter = 0
total_samp = 0
if args.mlperf_logging:
mlperf_logger.mlperf_submission_log("dlrm")
mlperf_logger.log_event(
key=mlperf_logger.constants.SEED, value=args.numpy_rand_seed
)
mlperf_logger.log_event(
key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, value=args.mini_batch_size
)
# Load model is specified
if not (args.load_model == ""):
print("Loading saved model {}".format(args.load_model))
if use_gpu:
if dlrm.ndevices > 1:
# NOTE: when targeting inference on multiple GPUs,
# load the model as is on CPU or GPU, with the move
# to multiple GPUs to be done in parallel_forward
ld_model = torch.load(args.load_model)
else:
# NOTE: when targeting inference on single GPU,
# note that the call to .to(device) has already happened
ld_model = torch.load(
args.load_model,
map_location=torch.device("cuda"),
# map_location=lambda storage, loc: storage.cuda(0)
)
else:
# when targeting inference on CPU
ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
dlrm.load_state_dict(ld_model["state_dict"])
ld_j = ld_model["iter"]
ld_k = ld_model["epoch"]
ld_nepochs = ld_model["nepochs"]
ld_nbatches = ld_model["nbatches"]
ld_nbatches_test = ld_model["nbatches_test"]
ld_train_loss = ld_model["train_loss"]
ld_total_loss = ld_model["total_loss"]
if args.mlperf_logging:
ld_gAUC_test = ld_model["test_auc"]
ld_acc_test = ld_model["test_acc"]
if not args.inference_only:
optimizer.load_state_dict(ld_model["opt_state_dict"])
best_acc_test = ld_acc_test
total_loss = ld_total_loss
skip_upto_epoch = ld_k # epochs
skip_upto_batch = ld_j # batches
else:
args.print_freq = ld_nbatches
args.test_freq = 0
print(
"Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format(
ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test
)
)
print(
"Training state: loss = {:.6f}".format(
ld_train_loss,
)
)
if args.mlperf_logging:
print(
"Testing state: accuracy = {:3.3f} %, auc = {:.3f}".format(
ld_acc_test * 100, ld_gAUC_test
)
)
else:
print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100))
if args.inference_only:
# Currently only dynamic quantization with INT8 and FP16 weights are
# supported for MLPs and INT4 and INT8 weights for EmbeddingBag
# post-training quantization during the inference.
# By default we don't do the quantization: quantize_{mlp,emb}_with_bit == 32 (FP32)
assert args.quantize_mlp_with_bit in [
8,
16,
32,
], "only support 8/16/32-bit but got {}".format(args.quantize_mlp_with_bit)
assert args.quantize_emb_with_bit in [
4,
8,
32,
], "only support 4/8/32-bit but got {}".format(args.quantize_emb_with_bit)
if args.quantize_mlp_with_bit != 32:
if args.quantize_mlp_with_bit in [8]:
quantize_dtype = torch.qint8
else:
quantize_dtype = torch.float16
dlrm = torch.quantization.quantize_dynamic(
dlrm, {torch.nn.Linear}, quantize_dtype
)
if args.quantize_emb_with_bit != 32:
dlrm.quantize_embedding(args.quantize_emb_with_bit)
# print(dlrm)
print("time/loss/accuracy (if enabled):")
if args.mlperf_logging:
# LR is logged twice for now because of a compliance checker bug
mlperf_logger.log_event(
key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate
)
mlperf_logger.log_event(
key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS,
value=args.lr_num_warmup_steps,
)
# use logging keys from the official HP table and not from the logging library
mlperf_logger.log_event(
key="sgd_opt_base_learning_rate", value=args.learning_rate
)
mlperf_logger.log_event(
key="lr_decay_start_steps", value=args.lr_decay_start_step
)
mlperf_logger.log_event(
key="sgd_opt_learning_rate_decay_steps", value=args.lr_num_decay_steps
)
mlperf_logger.log_event(key="sgd_opt_learning_rate_decay_poly_power", value=2)
tb_file = "./" + args.tensor_board_filename
writer = SummaryWriter(tb_file)
ext_dist.barrier()
with torch.autograd.profiler.profile(
args.enable_profiling, use_cuda=use_gpu, record_shapes=True
) as prof:
if not args.inference_only:
k = 0
total_time_begin = 0
while k < args.nepochs:
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.BLOCK_START,
metadata={
mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1),
mlperf_logger.constants.EPOCH_COUNT: 1,
},
)
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.EPOCH_START,
metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)},
)
if k < skip_upto_epoch:
continue
if args.mlperf_logging:
previous_iteration_time = None
for j, inputBatch in enumerate(train_ld):
if j == 0 and args.save_onnx:
X_onnx, lS_o_onnx, lS_i_onnx, _, _, _ = unpack_batch(inputBatch)
if j < skip_upto_batch:
continue
X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch)
if args.mlperf_logging:
current_time = time_wrap(use_gpu)
if previous_iteration_time:
iteration_time = current_time - previous_iteration_time
else:
iteration_time = 0
previous_iteration_time = current_time
else:
t1 = time_wrap(use_gpu)
# early exit if nbatches was set by the user and has been exceeded
if nbatches > 0 and j >= nbatches:
break
# Skip the batch if batch size not multiple of total ranks
if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0:
print(
"Warning: Skiping the batch %d with size %d"
% (j, X.size(0))
)
continue
mbs = T.shape[0] # = args.mini_batch_size except maybe for last
# forward pass
Z = dlrm_wrap(
X,
lS_o,
lS_i,
use_gpu,
device,
ndevices=ndevices,
)
if ext_dist.my_size > 1:
T = T[ext_dist.get_my_slice(mbs)]
W = W[ext_dist.get_my_slice(mbs)]
# loss
E = loss_fn_wrap(Z, T, use_gpu, device)
# compute loss and accuracy
L = E.detach().cpu().numpy() # numpy array
# training accuracy is not disabled
# S = Z.detach().cpu().numpy() # numpy array
# T = T.detach().cpu().numpy() # numpy array
# # print("res: ", S)
# # print("j, train: BCE ", j, L)
# mbs = T.shape[0] # = args.mini_batch_size except maybe for last
# A = np.sum((np.round(S, 0) == T).astype(np.uint8))
with record_function("DLRM backward"):
# scaled error gradient propagation
# (where we do not accumulate gradients across mini-batches)
if (
args.mlperf_logging
and (j + 1) % args.mlperf_grad_accum_iter == 0
) or not args.mlperf_logging:
optimizer.zero_grad()
# backward pass
E.backward()
# optimizer
if (
args.mlperf_logging
and (j + 1) % args.mlperf_grad_accum_iter == 0
) or not args.mlperf_logging:
optimizer.step()
lr_scheduler.step()
if args.mlperf_logging:
total_time += iteration_time
else:
t2 = time_wrap(use_gpu)
total_time += t2 - t1
total_loss += L * mbs
total_iter += 1
total_samp += mbs
should_print = ((j + 1) % args.print_freq == 0) or (
j + 1 == nbatches
)
should_test = (
(args.test_freq > 0)
and (args.data_generation in ["dataset", "random"])
and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
)
# print time, loss and accuracy
if should_print or should_test:
gT = 1000.0 * total_time / total_iter if args.print_time else -1
total_time = 0
train_loss = total_loss / total_samp
total_loss = 0
str_run_type = (
"inference" if args.inference_only else "training"
)
wall_time = ""
if args.print_wall_time:
wall_time = " ({})".format(time.strftime("%H:%M"))
print(
"Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format(
str_run_type, j + 1, nbatches, k, gT
)
+ " loss {:.6f}".format(train_loss)
+ wall_time,
flush=True,
)
log_iter = nbatches * k + j + 1
writer.add_scalar("Train/Loss", train_loss, log_iter)
total_iter = 0
total_samp = 0
# testing
if should_test:
epoch_num_float = (j + 1) / len(train_ld) + k + 1
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.EVAL_START,
metadata={
mlperf_logger.constants.EPOCH_NUM: epoch_num_float
},
)
# don't measure training iter time in a test iteration
if args.mlperf_logging:
previous_iteration_time = None
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
)
model_metrics_dict, is_best = inference(
args,
dlrm,
best_acc_test,
best_auc_test,
test_ld,
device,
use_gpu,
log_iter,
)
if (
is_best
and not (args.save_model == "")
and not args.inference_only
):
model_metrics_dict["epoch"] = k
model_metrics_dict["iter"] = j + 1
model_metrics_dict["train_loss"] = train_loss
model_metrics_dict["total_loss"] = total_loss
model_metrics_dict["opt_state_dict"] = (
optimizer.state_dict()
)
print("Saving model to {}".format(args.save_model))
torch.save(model_metrics_dict, args.save_model)
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.EVAL_STOP,
metadata={
mlperf_logger.constants.EPOCH_NUM: epoch_num_float
},
)
# Uncomment the line below to print out the total time with overhead
# print("Total test time for this group: {}" \
# .format(time_wrap(use_gpu) - accum_test_time_begin))
if (
args.mlperf_logging
and (args.mlperf_acc_threshold > 0)
and (best_acc_test > args.mlperf_acc_threshold)
):
print(
"MLPerf testing accuracy threshold "
+ str(args.mlperf_acc_threshold)
+ " reached, stop training"
)
break
if (
args.mlperf_logging
and (args.mlperf_auc_threshold > 0)
and (best_auc_test > args.mlperf_auc_threshold)
):
print(
"MLPerf testing auc threshold "
+ str(args.mlperf_auc_threshold)
+ " reached, stop training"
)
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.RUN_STOP,
metadata={
mlperf_logger.constants.STATUS: mlperf_logger.constants.SUCCESS
},
)
break
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.EPOCH_STOP,
metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)},
)
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.BLOCK_STOP,
metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1)},
)
k += 1 # nepochs
if args.mlperf_logging and best_auc_test <= args.mlperf_auc_threshold:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.RUN_STOP,
metadata={
mlperf_logger.constants.STATUS: mlperf_logger.constants.ABORTED
},
)
else:
print("Testing for inference only")
inference(
args,
dlrm,
best_acc_test,
best_auc_test,
test_ld,
device,
use_gpu,
)
# profiling
if args.enable_profiling:
time_stamp = str(datetime.datetime.now()).replace(" ", "_")
with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f:
prof_f.write(
prof.key_averages(group_by_input_shape=True).table(
sort_by="self_cpu_time_total"
)
)
with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f:
prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total"))
prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json")
# print(prof.key_averages().table(sort_by="cpu_time_total"))
# plot compute graph
if args.plot_compute_graph:
sys.exit(
"ERROR: Please install pytorchviz package in order to use the"
+ " visualization. Then, uncomment its import above as well as"
+ " three lines below and run the code again."
)
# V = Z.mean() if args.inference_only else E
# dot = make_dot(V, params=dict(dlrm.named_parameters()))
# dot.render('dlrm_s_pytorch_graph') # write .pdf file
# test prints
if not args.inference_only and args.debug_mode:
print("updated parameters (weights and bias):")
for param in dlrm.parameters():
print(param.detach().cpu().numpy())
# export the model in onnx
if args.save_onnx:
"""
# workaround 1: tensor -> list
if torch.is_tensor(lS_i_onnx):
lS_i_onnx = [lS_i_onnx[j] for j in range(len(lS_i_onnx))]
# workaound 2: list -> tensor
lS_i_onnx = torch.stack(lS_i_onnx)
"""
# debug prints
# print("inputs", X_onnx, lS_o_onnx, lS_i_onnx)
# print("output", dlrm_wrap(X_onnx, lS_o_onnx, lS_i_onnx, use_gpu, device))
dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx"
batch_size = X_onnx.shape[0]
print("X_onnx.shape", X_onnx.shape)
if torch.is_tensor(lS_o_onnx):
print("lS_o_onnx.shape", lS_o_onnx.shape)
else:
for oo in lS_o_onnx:
print("oo.shape", oo.shape)
if torch.is_tensor(lS_i_onnx):
print("lS_i_onnx.shape", lS_i_onnx.shape)
else:
for ii in lS_i_onnx:
print("ii.shape", ii.shape)
# name inputs and outputs
o_inputs = (
["offsets"]
if torch.is_tensor(lS_o_onnx)
else ["offsets_" + str(i) for i in range(len(lS_o_onnx))]
)
i_inputs = (
["indices"]
if torch.is_tensor(lS_i_onnx)
else ["indices_" + str(i) for i in range(len(lS_i_onnx))]
)
all_inputs = ["dense_x"] + o_inputs + i_inputs
# debug prints
print("inputs", all_inputs)
# create dynamic_axis dictionaries
do_inputs = (
[{"offsets": {1: "batch_size"}}]
if torch.is_tensor(lS_o_onnx)
else [
{"offsets_" + str(i): {0: "batch_size"}} for i in range(len(lS_o_onnx))
]
)
di_inputs = (
[{"indices": {1: "batch_size"}}]
if torch.is_tensor(lS_i_onnx)
else [
{"indices_" + str(i): {0: "batch_size"}} for i in range(len(lS_i_onnx))
]
)
dynamic_axes = {"dense_x": {0: "batch_size"}, "pred": {0: "batch_size"}}
for do in do_inputs:
dynamic_axes.update(do)
for di in di_inputs:
dynamic_axes.update(di)
# debug prints
print(dynamic_axes)
# export model
torch.onnx.export(
dlrm,
(X_onnx, lS_o_onnx, lS_i_onnx),
dlrm_pytorch_onnx_file,
verbose=True,
opset_version=11,
input_names=all_inputs,
output_names=["pred"],
dynamic_axes=dynamic_axes,
dynamo=False,
)
# recover the model back
dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx")
# check the onnx model
onnx.checker.check_model(dlrm_pytorch_onnx)
total_time_end = time_wrap(use_gpu)
# Standard script entry guard: invoke run() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import builtins
import os
import sys
import torch
import torch.distributed as dist
from torch.autograd import Function
from torch.autograd.profiler import record_function
from torch.nn.parallel import DistributedDataParallel as DDP
# Optional collective-communication backends: probe for the Intel oneCCL and
# UCC PyTorch bindings; when unavailable the module-level name is set to
# False so later code can test it truthily.
try:
    import torch_ccl
except ImportError as e:
    # print(e)
    torch_ccl = False
try:
    import torch_ucc
except ImportError as e:
    torch_ucc = False
# Distributed topology state, populated by init_distributed(); -1 means
# "not initialized yet".
my_rank = -1
my_size = -1
my_local_rank = -1
my_local_size = -1
# Set True once an all_to_all_single probe succeeds on the chosen backend.
alltoall_supported = False
# User-requested alltoall implementation: "", "alltoall", "scatter",
# or "scatter_list".
a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "")
# Module-level Request object shared between the *_Req and *_Wait
# autograd functions; created by init_distributed().
myreq = None
def env2int(env_list, default=-1):
    """Return the first non-negative integer found among the named
    environment variables, or *default* if none of them is set."""
    for var_name in env_list:
        value = int(os.environ.get(var_name, -1))
        if value >= 0:
            return value
    return default
def get_my_slice(n):
    """Return the slice of range(n) owned by this rank under a balanced
    partition (the first n % my_size ranks get one extra element)."""
    base, extra = divmod(n, my_size)
    start = my_rank * base + min(my_rank, extra)
    stop = (my_rank + 1) * base + min(my_rank + 1, extra)
    return slice(start, stop, 1)
def get_split_lengths(n):
    """Partition n items across my_size ranks as evenly as possible.

    Returns (my_len, splits): this rank's share, and the per-rank list of
    shares — or None for splits when n divides evenly (uniform case).
    """
    base, extra = divmod(n, my_size)
    if extra == 0:
        return (base, None)
    splits = [base + 1 if r < extra else base for r in range(my_size)]
    return (splits[my_rank], splits)
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
    """Initialize torch.distributed and populate the module topology globals.

    Auto-detects the process-group backend and rank/size from common MPI and
    launcher environment variables when not given explicitly, probes
    all_to_all_single support, and finally creates the shared Request object.

    Args:
        rank: global rank; -1 means guess from env vars.
        local_rank: rank within the node; -1 means guess from env vars.
        size: world size; -1 means guess from env vars.
        use_gpu: if True, pin this process to CUDA device my_local_rank.
        backend: explicit backend name ("ccl"/"nccl"/"mpi"/"gloo"); "" picks
            one automatically when a multi-rank MPI-style launch is detected.
    """
    global myreq
    global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported
    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(
        ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
    )
    # Auto-select a backend only for multi-rank launches.
    if backend == "" and num_mpi_ranks > 1:
        if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
            backend = "ccl"
        elif use_gpu and dist.is_nccl_available():
            backend = "nccl"
        elif dist.is_mpi_available():
            backend = "mpi"
        else:
            print(
                "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
            )
            backend = "gloo"
    if backend != "":
        # guess Rank and size
        if rank == -1:
            rank = env2int(
                ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
            )
        if size == -1:
            size = env2int(
                [
                    "PMI_SIZE",
                    "OMPI_COMM_WORLD_SIZE",
                    "MV2_COMM_WORLD_SIZE",
                    "WORLD_SIZE",
                ],
                1,
            )
        # Export the settings that init_process_group's env:// init reads,
        # without clobbering anything the launcher already set.
        if not os.environ.get("RANK", None) and rank != -1:
            os.environ["RANK"] = str(rank)
        if not os.environ.get("WORLD_SIZE", None) and size != -1:
            os.environ["WORLD_SIZE"] = str(size)
        if not os.environ.get("MASTER_PORT", None):
            os.environ["MASTER_PORT"] = "29500"
        if not os.environ.get("MASTER_ADDR", None):
            local_size = env2int(
                [
                    "MPI_LOCALNRANKS",
                    "OMPI_COMM_WORLD_LOCAL_SIZE",
                    "MV2_COMM_WORLD_LOCAL_SIZE",
                ],
                1,
            )
            # local_size != size implies ranks span multiple nodes, where a
            # loopback MASTER_ADDR will not work (except for the mpi backend).
            if local_size != size and backend != "mpi":
                print(
                    "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
                )
                print(
                    "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR"
                )
            os.environ["MASTER_ADDR"] = "127.0.0.1"
    if size > 1:
        if local_rank == -1:
            # guess the local (per-node) rank from launcher-specific env vars
            my_local_rank = env2int(
                [
                    "MPI_LOCALRANKID",
                    "OMPI_COMM_WORLD_LOCAL_RANK",
                    "MV2_COMM_WORLD_LOCAL_RANK",
                    "LOCAL_RANK",
                ],
                0,
            )
        else:
            my_local_rank = local_rank
        my_local_size = env2int(
            [
                "MPI_LOCALNRANKS",
                "OMPI_COMM_WORLD_LOCAL_SIZE",
                "MV2_COMM_WORLD_LOCAL_SIZE",
            ],
            1,
        )
        if use_gpu:
            if my_local_size > torch.cuda.device_count():
                print(
                    "Not sufficient GPUs available... local_size = %d, ngpus = %d"
                    % (my_local_size, torch.cuda.device_count())
                )
                sys.exit(1)
            # one GPU per local rank
            torch.cuda.set_device(my_local_rank)
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
        # Probe whether the chosen backend actually supports the fused
        # all_to_all_single primitive by running a tiny exchange.
        if hasattr(dist, "all_to_all_single"):
            try:
                t = torch.zeros([4])
                if use_gpu:
                    t = t.cuda()
                dist.all_to_all_single(t, t)
                alltoall_supported = True
            except RuntimeError as err:
                print("fail to enable all_to_all_single primitive: %s" % err)
        # Fall back to the scatter/gather implementation if the user asked
        # for the fused alltoall but the backend cannot provide it.
        if a2a_impl == "alltoall" and alltoall_supported == False:
            print(
                "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
                % (a2a_impl, backend)
            )
            a2a_impl = "scatter"
        if a2a_impl != "":
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
    else:
        # Single-process run: trivial topology, no process group created.
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    print_all(
        "world size: %d, current rank: %d, local rank: %d"
        % (my_size, my_rank, my_local_rank)
    )
    myreq = Request()
class Request(object):
    """Handle for an in-flight collective operation.

    The *_Req autograd functions stash their async handles and output
    tensors on the module-level instance; wait() runs the matching
    WaitFunction to complete the exchange and returns its result.
    """

    def __init__(self):
        self.req = None
        self.tensor = None
        self.WaitFunction = All2All_Scatter_Wait

    def wait(self):
        """Complete the pending exchange and return its outputs."""
        result = self.WaitFunction.apply(*self.tensor)
        self.tensor = None
        self.req = None
        return result
class All2All_ScatterList_Req(Function):
    """Launch phase of the scatter-list alltoall: one async dist.scatter per
    (rank, table) pair, so each rank receives every table's embeddings for
    its own batch shard.  Completed by All2All_ScatterList_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        # inputs: one (local batch, emb_dim) tensor per local embedding table.
        global myreq
        # Per-rank batch shard sizes; a scalar fallback means uniform shards.
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # Number of embedding tables owned by each rank.
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        gather_list = []
        req_list = []
        for i in range(my_size):
            for j in range(table_split_lengths[i]):
                out_tensor = inputs[0].new_empty(
                    [a2a_info.local_batch_num, a2a_info.emb_dim]
                )
                # Only the source rank supplies the scatter payload; other
                # ranks pass an empty list and just receive.
                scatter_list = (
                    list(inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else []
                )
                req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True)
                gather_list.append(out_tensor)
                req_list.append(req)
        # Hand the async handles and buffers to the matching Wait function
        # via the module-level request object.
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # Complete the async gathers issued by All2All_ScatterList_Wait.backward.
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_inputs = myreq.tensor
        myreq.tensor = None
        # Leading None is the gradient slot for the non-tensor a2a_info arg.
        return (None, *grad_inputs)
class All2All_ScatterList_Wait(Function):
    """Completion phase of the scatter-list alltoall: waits on the scatters
    issued by All2All_ScatterList_Req; its backward launches the mirror
    per-(rank, table) async gathers for the gradients."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        # Stash exchange metadata for backward before clearing the request.
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        a2a_info = ctx.a2a_info
        # dist.gather requires contiguous tensors.
        grad_output = [t.contiguous() for t in grad_output]
        # Per-rank batch shard sizes (uniform fallback when not provided).
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else [a2a_info.local_batch_num] * my_size
        )
        # Number of embedding tables owned by each rank.
        per_rank_table_splits = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        # One full-batch gradient buffer per local table.
        grad_inputs = [
            grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim])
            for _ in range(a2a_info.local_table_num)
        ]
        req_list = []
        ind = 0
        for i in range(my_size):
            for j in range(per_rank_table_splits[i]):
                # Destination rank i reassembles its table-j gradient from
                # every rank's batch shard; non-destination ranks pass None.
                gather_list = (
                    list(grad_inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else None
                )
                req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True)
                req_list.append(req)
                ind += 1
        # All2All_ScatterList_Req.backward waits on these and returns the
        # gathered gradients.
        myreq.req = req_list
        myreq.tensor = grad_inputs
        return tuple(grad_output)
class All2All_Scatter_Req(Function):
    """Launch phase of the scatter-based alltoall: concatenates the local
    tables and issues one async dist.scatter per rank.  Completed by
    All2All_Scatter_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        # Per-rank batch shard sizes; a scalar fallback means uniform shards.
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # Number of embedding tables owned by each rank.
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        # All local tables side by side: (batch, local_table_num * emb_dim).
        input = torch.cat(inputs, dim=1)
        scatter_list = list(input.split(batch_split_lengths, dim=0))
        gather_list = []
        req_list = []
        for i in range(my_size):
            # Buffer for rank i's tables restricted to our batch shard.
            out_tensor = input.new_empty(
                [a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim]
            )
            # Only the source rank supplies the payload.
            req = dist.scatter(
                out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True
            )
            gather_list.append(out_tensor)
            req_list.append(req)
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        ctx.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # Complete the async gathers issued by All2All_Scatter_Wait.backward.
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_input = myreq.tensor
        # Split the concatenated gradient back into per-table chunks.
        grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1)
        myreq.tensor = None
        # Leading None is the gradient slot for the non-tensor a2a_info arg.
        return (None, *grad_inputs)
class All2All_Scatter_Wait(Function):
    """Completion phase of the scatter-based alltoall (pairs with
    All2All_Scatter_Req); backward issues one async dist.gather per rank to
    route the gradients back to their source ranks."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        # Stash exchange metadata for backward, then complete all scatters.
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # One gradient tensor per rank is expected.
        assert len(grad_output) == my_size
        # dist.gather requires contiguous tensors.
        scatter_list = [t.contiguous() for t in grad_output]
        a2a_info = ctx.a2a_info
        # Per-rank batch shard sizes (scalar fallback = uniform local batch).
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # (A table_split_lengths computation previously here was never used
        # and has been removed; the buffer shape depends only on the local
        # table count.)
        grad_input = grad_output[0].new_empty(
            [a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num]
        )
        gather_list = list(grad_input.split(batch_split_lengths, dim=0))
        req_list = []
        for i in range(my_size):
            # Rank i collects its batch shard's gradient from every rank.
            req = dist.gather(
                scatter_list[i],
                gather_list if i == my_rank else [],
                dst=i,
                async_op=True,
            )
            req_list.append(req)
        # All2All_Scatter_Req.backward waits on these and reads grad_input.
        myreq.req = req_list
        myreq.tensor = grad_input
        return grad_output
class All2All_Req(Function):
    """Launch phase of the fused alltoall: a single async
    dist.all_to_all_single over one flattened buffer.  Completed by
    All2All_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        with record_function("DLRM alltoall_req_fwd_single"):
            # Scale per-rank batch shard sizes to flat element counts
            # (input/send splits for all_to_all_single).
            batch_split_lengths = a2a_info.global_batch_partition_slices
            if batch_split_lengths:
                batch_split_lengths = [
                    m * a2a_info.emb_dim * a2a_info.local_table_num
                    for m in batch_split_lengths
                ]
            # Scale per-rank table counts to flat element counts
            # (output/recv splits).
            table_split_lengths = a2a_info.global_table_wise_parition_slices
            if table_split_lengths:
                table_split_lengths = [
                    a2a_info.local_batch_num * e * a2a_info.emb_dim
                    for e in table_split_lengths
                ]
            # Flatten all local tables into one 1-D send buffer.
            input = torch.cat(inputs, dim=1).view([-1])
            output = input.new_empty(
                [
                    a2a_info.global_table_num
                    * a2a_info.local_batch_num
                    * a2a_info.emb_dim
                ]
            )
            req = dist.all_to_all_single(
                output, input, table_split_lengths, batch_split_lengths, async_op=True
            )
            myreq.req = req
            myreq.tensor = []
            myreq.tensor.append(output)
            myreq.tensor = tuple(myreq.tensor)
            # Cache the element-count splits for the Wait/backward phases.
            a2a_info.batch_split_lengths = batch_split_lengths
            a2a_info.table_split_lengths = table_split_lengths
            myreq.a2a_info = a2a_info
            ctx.a2a_info = a2a_info
            return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        with record_function("DLRM alltoall_req_bwd_single"):
            a2a_info = ctx.a2a_info
            # Complete the async alltoall issued by All2All_Wait.backward.
            myreq.req.wait()
            myreq.req = None
            grad_input = myreq.tensor
            # Reshape the flat gradient to (batch, tables * emb_dim) and
            # split into per-table chunks.
            grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split(
                a2a_info.emb_dim, dim=1
            )
            grad_inputs = [gin.contiguous() for gin in grad_inputs]
            myreq.tensor = None
            # Leading None is the gradient slot for the non-tensor a2a_info.
            return (None, *grad_inputs)
class All2All_Wait(Function):
    """Completion phase of the fused alltoall: waits on the single
    all_to_all_single and reshapes the flat buffer into per-table outputs."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        with record_function("DLRM alltoall_wait_fwd_single"):
            a2a_info = myreq.a2a_info
            ctx.a2a_info = a2a_info
            myreq.req.wait()
            myreq.req = None
            myreq.tensor = None
            # Element counts received from each rank; the scalar fallback
            # assumes a uniform split across ranks.
            table_split_lengths = (
                a2a_info.table_split_lengths
                if a2a_info.table_split_lengths
                else a2a_info.local_table_num
                * a2a_info.local_batch_num
                * a2a_info.emb_dim
            )
            # Split into per-source chunks, then restore 2-D (batch, feat).
            outputs = output[0].split(table_split_lengths)
            outputs = tuple(
                [out.view([a2a_info.local_batch_num, -1]) for out in outputs]
            )
            return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        global myreq
        with record_function("DLRM alltoall_wait_bwd_single"):
            a2a_info = ctx.a2a_info
            # Flatten per-table gradients into one 1-D send buffer.
            grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
            grad_output = torch.cat(grad_outputs)
            grad_input = grad_output.new_empty(
                [a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim]
            )
            # Reverse exchange: send/recv splits swapped vs. forward.
            req = dist.all_to_all_single(
                grad_input,
                grad_output,
                a2a_info.batch_split_lengths,
                a2a_info.table_split_lengths,
                async_op=True,
            )
            myreq.req = req
            myreq.tensor = grad_input
            return (grad_output,)
class AllGather(Function):
    """Autograd-aware all_gather along an arbitrary dimension with
    potentially unequal per-rank lengths."""

    @staticmethod
    def forward(ctx, input, global_lengths, dim=0):
        """Gather `input` from all ranks and concatenate along `dim`.

        Args:
            input: this rank's shard; its size along `dim` must match this
                rank's entry in `global_lengths`.
            global_lengths: per-rank lengths along `dim`; a scalar means the
                same length on every rank.
            dim: concatenation dimension.
        """
        if not isinstance(global_lengths, (list, tuple)):
            global_lengths = [global_lengths] * my_size
        assert len(global_lengths) == my_size
        assert global_lengths[my_rank] == input.size(dim)
        local_start = sum(global_lengths[:my_rank])
        output_size = list(input.size())
        # Remember where this rank's shard lives so backward can slice it out.
        ctx.dim = dim
        ctx.local_start = local_start
        ctx.local_length = global_lengths[my_rank]
        input = input.contiguous()
        if dim == 0:
            # Gather directly into views of one preallocated output tensor.
            out_len = sum(global_lengths)
            output_size[dim] = out_len
            output = input.new_empty(output_size)
            gather_list = list(output.split(global_lengths, dim=0))
        else:
            # Gather into per-rank buffers and concatenate afterwards.
            # (A list of empty_like buffers was previously allocated here and
            # immediately discarded; that dead allocation has been removed.)
            gather_list = []
            for length in global_lengths:
                output_size[dim] = length
                gather_list.append(input.new_empty(output_size))
        dist.all_gather(gather_list, input)
        if dim != 0:
            output = torch.cat(gather_list, dim=dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # print("Inside All2AllBackward")
        # Gradient of all_gather: each rank keeps only its own slice.
        dim = ctx.dim
        start = ctx.local_start
        length = ctx.local_length
        grad_input = grad_output.narrow(dim, start, length)
        # None gradients for the non-tensor global_lengths and dim arguments.
        return (grad_input, None, None)
class All2AllInfo(object):
    """Plain attribute bag describing one alltoall exchange; fields
    (table/batch partitioning, emb_dim, batch_size, ...) are assigned
    ad hoc by alltoall()."""
def alltoall(inputs, per_rank_table_splits):
    """Start an asynchronous alltoall exchange of embedding outputs.

    Args:
        inputs: list of (batch_size, emb_dim) tensors, one per local table.
        per_rank_table_splits: per-rank table counts, or falsy for a uniform
            distribution of local_table_num tables per rank.

    Returns:
        The module-level Request; call its wait() to obtain the outputs.
    """
    global myreq
    batch_size, emb_dim = inputs[0].size()
    a2a_info = All2AllInfo()
    a2a_info.local_table_num = len(inputs)
    a2a_info.global_table_wise_parition_slices = per_rank_table_splits
    (
        a2a_info.local_batch_num,
        a2a_info.global_batch_partition_slices,
    ) = get_split_lengths(batch_size)
    a2a_info.emb_dim = emb_dim
    a2a_info.batch_size = batch_size
    a2a_info.global_table_num = (
        sum(per_rank_table_splits)
        if per_rank_table_splits
        else a2a_info.local_table_num * my_size
    )
    # Dispatch to the fused alltoall when supported/requested, otherwise to
    # one of the scatter-based fallbacks.
    if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall":
        # print("Using All2All_Req")
        output = All2All_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Wait
    elif a2a_impl == "" or a2a_impl == "scatter":
        # print("Using All2All_Scatter_Req")
        output = All2All_Scatter_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Scatter_Wait
    elif a2a_impl == "scatter_list":
        # print("Using All2All_ScatterList_Req")
        output = All2All_ScatterList_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_ScatterList_Wait
    else:
        # NOTE(review): on an unknown impl we only warn and still return
        # myreq with a stale WaitFunction — presumably unreachable since
        # init_distributed() normalizes a2a_impl; confirm.
        print(
            "Unknown value set for DLRM_ALLTOALL_IMPL (%s), "
            "please use one of [alltoall, scatter, scatter_list]" % a2a_impl
        )
    return myreq
def all_gather(input, lengths, dim=0):
    """All-gather `input` across ranks along `dim`; a falsy `lengths` means
    every rank contributes input.size(0) rows."""
    per_rank_lengths = lengths if lengths else [input.size(0)] * my_size
    return AllGather.apply(input, per_rank_lengths, dim)
def barrier():
    """Synchronize all ranks; a no-op when running single-process."""
    if my_size <= 1:
        return
    dist.barrier()
# Override the builtin print so that, by default, only rank 0 emits output.
orig_print = builtins.print


def rank0_print(*args, **kwargs):
    """Rank-aware replacement for print().

    Prints when this process is rank 0 (or rank is uninitialized, my_rank
    <= 0), or when the caller passes print_all=True.  The print_all flag is
    consumed here (popped) rather than merely read: previously it was left
    in kwargs and forwarded to the real print(), which would raise
    TypeError on the unexpected keyword argument.
    """
    print_everywhere = kwargs.pop("print_all", False)
    if print_everywhere or my_rank <= 0:
        orig_print(*args, **kwargs)


builtins.print = rank0_print
# Allow printing from all ranks by calling print_all() explicitly; unlike
# the rank-0-only builtin override, this never filters by rank.
def print_all(*args, **kwargs):
    """Print from every rank using the original builtin print."""
    orig_print(*args, **kwargs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment