Commit 9c8a2a14 authored by xinghao

Initial commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code
## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install torch==1.3.1
WORKDIR /code
ADD . .
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
DLRM: Deep Learning Recommendation Model for Personalization and Recommendation Systems
=================================================================================
Introduction
------------
An implementation of a Deep Learning Recommendation Model (DLRM).
The model's input consists of dense and sparse features. The former is a vector of floating-point values; the latter is a list of sparse indices used to look up rows in embedding tables (which themselves consist of floating-point vectors).
The selected vectors are fed into several multilayer perceptron (MLP) networks (shown as triangles in the diagram below), and in some cases the vectors interact with each other through specific operators (Ops).
```
output:
probability of a click
model: |
/\
/__\
|
_____________________> Op <___________________
/ | \
/\ /\ /\
/__\ /__\ ... /__\
| | |
| Op Op
| ____/__\_____ ____/__\____
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```
A more precise definition of the model's layers:
1) fully connected layers of the MLP (multilayer perceptron)
z = f(y)
y = Wx + b
2) embedding lookup (for a list of sparse indices p = [p_1, ..., p_k])
z = Op(e_1, ..., e_k)
obtain vectors e_1 = E[:, p_1], ..., e_k = E[:, p_k]
3) operator Op can be one of the following
Sum(e_1, ..., e_k) = e_1 + ... + e_k
Dot(e_1, ..., e_k) = [e_1'e_1, ..., e_1'e_k, ..., e_k'e_1, ..., e_k'e_k]
Cat(e_1, ..., e_k) = [e_1', ..., e_k']'
where ' denotes the transpose operation
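For intuition, here is a minimal PyTorch sketch of these three building blocks (illustrative only: the tensor names and sizes are made up and do not correspond to the actual implementation in `dlrm_s_pytorch.py`):
```python
import torch
import torch.nn as nn

# 1) one fully connected MLP layer: z = f(Wx + b), with f = ReLU here
x = torch.randn(4, 16)                    # mini-batch of 4 dense feature vectors
fc = nn.Linear(16, 8)
z_mlp = torch.relu(fc(x))                 # shape (4, 8)

# 2) embedding lookup for sparse indices p = [p_1, ..., p_k], reduced with Sum
E = nn.EmbeddingBag(1000, 8, mode="sum")  # table with 1000 rows of dimension 8
p = torch.tensor([[3, 17, 42, 7]])        # k = 4 indices for a single sample
z_emb = E(p)                              # Sum(e_1, ..., e_k), shape (1, 8)

# 3) Dot interaction between a set of vectors e_1, ..., e_k
e = torch.randn(1, 5, 8)                  # 5 vectors of dimension 8 per sample
z_dot = torch.bmm(e, e.transpose(1, 2))   # all pairwise dot products, shape (1, 5, 5)
```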
Deployment
--------------
### Docker
**Container creation**
```bash
docker run --shm-size 500g --network=host --name=dlrm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/workspace/:/path/to/workspace/ -v /opt/hyhal:/opt/hyhal:ro -it image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10 bash
```
**Dependency installation**
```bash
cd dlrm
pip install -r requirements.txt
pip install tensorboard
```
Note: installing with `-i https://pypi.tuna.tsinghua.edu.cn/simple` causes the installation of the `torchrec-nightly` related dependencies to fail.
Demo
--------------------
1) Run the code with a tiny model
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
```
<img src="./images/image1.png" width="900">
2) Run the code with a tiny model in debug mode
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
```
<img src="./images/image2.png" width="900">
Testing
-------
Verify the functional correctness of the code:
```bash
./test/dlrm_s_test.sh
```
<img src="./images/image3.png" width="900">
Benchmarking
------------
1) Performance benchmark
```bash
./bench/dlrm_s_benchmark.sh
```
2) The code supports the [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/)
- Prepare the data as follows for use with the DLRM code:
  - First, specify the downloaded raw data file (train.txt) with `--raw-data-file=<path/train.txt>`
  - The data is then pre-processed (categorized, merged across days, etc.) for use with the DLRM code
  - The pre-processed data is stored as `*.npz` files under `<root_dir>/input/*.npz`
  - The pre-processed files (`*.npz`) can be reused in subsequent runs with `--processed-data-file=<path/*.npz>`
- The model can be trained with the following script
```bash
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
To enable GPU training, add `--use-gpu`; to run the model in inference-only mode, add `--inference-only` and specify the weights file with `--load-model`.
<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3) The code supports the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).
- Prepare the data as follows for use with the DLRM code:
  - First, download the raw data files `day_0.gz` to `day_23.gz` and uncompress them
  - Specify the location of the uncompressed text files `day_0` to `day_23` with `--raw-data-file=<path/day>` (the day number is appended automatically)
  - The data is then pre-processed (categorized, merged across days, etc.) for use with the DLRM code
  - The pre-processed data is stored as `*.npz` files under `<root_dir>/input/*.npz`
  - The pre-processed files (`*.npz`) can be reused in subsequent runs with `--processed-data-file=<path/*.npz>`
- The model can be trained with the following script
```bash
./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"]
```
To enable GPU training, add `--use-gpu`; to run the model in inference-only mode, add `--inference-only` and specify the weights file with `--load-model`.
- The corresponding pre-trained model can be downloaded here: [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt)
<img src="./terabyte_0875_loss_accuracy_plots.png" width="900" height="320">
4) The code supports the [MLPerf benchmark](https://mlperf.org).
- Refer to the following training parameters
```bash
--mlperf-logging        tracks multiple metrics, including area under the curve (AUC)
--mlperf-acc-threshold  allows early stopping of training based on the accuracy metric
--mlperf-auc-threshold  allows early stopping of training based on the AUC metric
--mlperf-bin-loader     enables pre-processing the data into a single binary file
--mlperf-bin-shuffle    controls whether mini-batches are randomly shuffled
```
- The MLPerf model can be trained with the following script.
```bash
./bench/run_and_time.sh [--use-gpu]
```
- The corresponding pre-trained model can be downloaded here: [dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt)
5) The code now supports synchronous distributed training over the gloo/nccl/mpi backends, and can be launched either with the [PyTorch distributed launcher](https://pytorch.org/docs/stable/distributed.html#launch-utility) or with mpirun. For MPI, users need to write their own MPI launch script to configure the hosts. For example, with the PyTorch distributed launcher, the following command can be used as a launch script:
```bash
# On a single node with 8 GPUs, using the NCCL backend on a randomly generated dataset:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000
--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# For multi-node setups, add the relevant arguments per the launcher manual, for example:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
Saving/loading model checkpoints
-------------------------------
During training, the model can be saved with `--save-model=<path/model.pt>`.
The model is saved whenever the test accuracy improves (checked at the interval specified by `--test-freq`).
A saved model can be loaded with `--load-model=<path/model.pt>`.
Once loaded, the model can be used to continue training, with the saved model serving as a checkpoint; alternatively, by specifying `--inference-only`, the saved model can be used only to evaluate on the test dataset.
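Under the hood this follows the standard PyTorch checkpoint pattern. Below is a minimal sketch, assuming the checkpoint simply wraps the model's `state_dict`; the actual dictionary written by `dlrm_s_pytorch.py` may contain additional fields (e.g., optimizer state and iteration counters), and the placeholder network stands in for the real DLRM object:
```python
import torch
import torch.nn as nn

# hypothetical placeholder for the DLRM network
dlrm = nn.Sequential(nn.Linear(13, 64), nn.ReLU(), nn.Linear(64, 1))

# saving: done whenever the test accuracy improves
torch.save({"state_dict": dlrm.state_dict()}, "model.pt")

# loading: either to resume training or for --inference-only evaluation
checkpoint = torch.load("model.pt", map_location="cpu")
dlrm.load_state_dict(checkpoint["state_dict"])
dlrm.eval()  # inference-only mode
```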
References
-------
https://github.com/facebookresearch/dlrm
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
cpu=1
gpu=1
pt=1
ncores=28 #12 #6
nsockets="0"
ngpus="1 2 4 8"
numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"
data=random #synthetic
print_freq=100
rand_seed=727
#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384
#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "
# CPU Benchmarking
if [ $cpu = 1 ]; then
echo "--------------------------------------------"
echo "CPU Benchmarking - running on $ncores cores"
echo "--------------------------------------------"
if [ $pt = 1 ]; then
outf="model1_CPU_PT_$ncores.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
fi
# GPU Benchmarking
if [ $gpu = 1 ]; then
echo "--------------------------------------------"
echo "GPU Benchmarking - running on $ngpus GPUs"
echo "--------------------------------------------"
for _ng in $ngpus
do
# weak scaling
# _mb_size=$((mb_size*_ng))
# strong scaling
_mb_size=$((mb_size*1))
_gpus=$(seq -s, 0 $((_ng-1)))
cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
echo "-------------------"
echo "Using GPUS: "$_gpus
echo "-------------------"
if [ $pt = 1 ]; then
outf="model1_GPU_PT_$_ng.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
done
fi
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log
echo "done"
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log
echo "done"
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log
echo "done"
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals
from distutils.extension import Extension
from Cython.Build import cythonize
from setuptools import setup
ext_modules = [
Extension(
"data_utils_cython",
["data_utils_cython.pyx"],
extra_compile_args=["-O3"],
extra_link_args=["-O3"],
)
]
setup(name="data_utils_cython", ext_modules=cythonize(ext_modules))
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
# > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile the data_utils_cython.pyx to generate .so
# (it's important to keep extension .pyx rather than .py
# to ensure the C/C++ .so, not the .py, is loaded at import time)
# > python cython_compile.py build_ext --inplace
# This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals
import data_utils_cython as duc
if __name__ == "__main__":
### import packages ###
import argparse
### parse arguments ###
parser = argparse.ArgumentParser(description="Preprocess Criteo dataset")
# model related parameters
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--memory-map", action="store_true", default=False)
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
args = parser.parse_args()
duc.loadDataset(
args.data_set,
args.max_ind_range,
args.data_sub_sample_rate,
args.data_randomize,
"train",
args.raw_data_file,
args.processed_data_file,
args.memory_map,
)
# @lint-ignore-every LICENSELINT
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import math
import os
import time
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
class DataLoader:
"""
DataLoader dedicated for the Criteo Terabyte Click Logs dataset
"""
def __init__(
self,
data_filename,
data_directory,
days,
batch_size,
max_ind_range=-1,
split="train",
drop_last_batch=False,
):
self.data_filename = data_filename
self.data_directory = data_directory
self.days = days
self.batch_size = batch_size
self.max_ind_range = max_ind_range
total_file = os.path.join(data_directory, data_filename + "_day_count.npz")
with np.load(total_file) as data:
total_per_file = data["total_per_file"][np.array(days)]
self.length = sum(total_per_file)
if split == "test" or split == "val":
self.length = int(np.ceil(self.length / 2.0))
self.split = split
self.drop_last_batch = drop_last_batch
def __iter__(self):
return iter(
_batch_generator(
self.data_filename,
self.data_directory,
self.days,
self.batch_size,
self.split,
self.drop_last_batch,
self.max_ind_range,
)
)
def __len__(self):
if self.drop_last_batch:
return self.length // self.batch_size
else:
return math.ceil(self.length / self.batch_size)
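# Example DataLoader usage (illustrative; paths and arguments are placeholders):
#   loader = DataLoader(data_filename="day", data_directory="./input",
#                       days=list(range(23)), batch_size=2048, split="train")
#   for x_int_batch, lS_o, x_cat_batch, y_batch in loader:
#       ...  # each item is a mini-batch produced by _transform_features below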
def _transform_features(
x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
):
if max_ind_range > 0:
x_cat_batch = x_cat_batch % max_ind_range
if flag_input_torch_tensor:
x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
else:
x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
batch_size = x_cat_batch.shape[0]
feature_count = x_cat_batch.shape[1]
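    # Build lS_o, the offsets tensor: with exactly one index per lookup, the offsets
    # are simply 0..batch_size-1, repeated once for each of the feature_count sparse
    # features (resulting shape: feature_count x batch_size).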
lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
def _batch_generator(
data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
):
previous_file = None
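    # previous_file carries over samples left at the end of one day's file so that
    # a mini-batch can span the boundary between two consecutive days.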
for day in days:
filepath = os.path.join(
data_directory, data_filename + "_{}_reordered.npz".format(day)
)
# print('Loading file: ', filepath)
with np.load(filepath) as data:
x_int = data["X_int"]
x_cat = data["X_cat"]
y = data["y"]
samples_in_file = y.shape[0]
batch_start_idx = 0
if split == "test" or split == "val":
length = int(np.ceil(samples_in_file / 2.0))
if split == "test":
samples_in_file = length
elif split == "val":
batch_start_idx = samples_in_file - length
while batch_start_idx < samples_in_file - batch_size:
missing_samples = batch_size
if previous_file is not None:
missing_samples -= previous_file["y"].shape[0]
current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)
x_int_batch = x_int[current_slice]
x_cat_batch = x_cat[current_slice]
y_batch = y[current_slice]
if previous_file is not None:
x_int_batch = np.concatenate(
[previous_file["x_int"], x_int_batch], axis=0
)
x_cat_batch = np.concatenate(
[previous_file["x_cat"], x_cat_batch], axis=0
)
y_batch = np.concatenate([previous_file["y"], y_batch], axis=0)
previous_file = None
if x_int_batch.shape[0] != batch_size:
raise ValueError("should not happen")
yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)
batch_start_idx += missing_samples
if batch_start_idx != samples_in_file:
current_slice = slice(batch_start_idx, samples_in_file)
if previous_file is not None:
previous_file = {
"x_int": np.concatenate(
[previous_file["x_int"], x_int[current_slice]], axis=0
),
"x_cat": np.concatenate(
[previous_file["x_cat"], x_cat[current_slice]], axis=0
),
"y": np.concatenate([previous_file["y"], y[current_slice]], axis=0),
}
else:
previous_file = {
"x_int": x_int[current_slice],
"x_cat": x_cat[current_slice],
"y": y[current_slice],
}
if not drop_last:
yield _transform_features(
previous_file["x_int"],
previous_file["x_cat"],
previous_file["y"],
max_ind_range,
)
def _test():
generator = _batch_generator(
data_filename="day",
data_directory="./input",
days=range(23),
split="train",
batch_size=2048,
drop_last=True,
max_ind_range=-1,
)
t1 = time.time()
for x_int, lS_o, x_cat, y in generator:
t2 = time.time()
time_diff = t2 - t1
t1 = t2
print(
"time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape
)
)
class CriteoBinDataset(Dataset):
"""Binary version of criteo dataset."""
def __init__(
self,
data_file,
counts_file,
batch_size=1,
max_ind_range=-1,
bytes_per_feature=4,
):
# dataset
self.tar_fea = 1 # single target
self.den_fea = 13 # 13 dense features
self.spa_fea = 26 # 26 sparse features
self.tad_fea = self.tar_fea + self.den_fea
self.tot_fea = self.tad_fea + self.spa_fea
self.batch_size = batch_size
self.max_ind_range = max_ind_range
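        # Each sample is stored as tot_fea (= 1 + 13 + 26 = 40) int32 values, so one
        # "entry" read from the file below corresponds to a full mini-batch of samples.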
self.bytes_per_entry = bytes_per_feature * self.tot_fea * batch_size
self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
print("data file:", data_file, "number of batches:", self.num_entries)
self.file = open(data_file, "rb")
with np.load(counts_file) as data:
self.counts = data["counts"]
# hardcoded for now
self.m_den = 13
def __len__(self):
return self.num_entries
def __getitem__(self, idx):
self.file.seek(idx * self.bytes_per_entry, 0)
raw_data = self.file.read(self.bytes_per_entry)
array = np.frombuffer(raw_data, dtype=np.int32)
tensor = torch.from_numpy(array).view((-1, self.tot_fea))
return _transform_features(
x_int_batch=tensor[:, 1:14],
x_cat_batch=tensor[:, 14:],
y_batch=tensor[:, 0],
max_ind_range=self.max_ind_range,
flag_input_torch_tensor=True,
)
def __del__(self):
self.file.close()
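# Example CriteoBinDataset usage (illustrative; paths are placeholders), mirroring
# _test_bin() below. Batching is done inside the dataset itself, so the torch
# DataLoader is created with batch_size=None:
#   dataset = CriteoBinDataset(data_file="train_data.bin",
#                              counts_file="day_fea_count.npz", batch_size=2048)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=None, shuffle=False)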
def numpy_to_binary(input_files, output_file_path, split="train"):
"""Convert the data to a binary format to be read with CriteoBinDataset."""
# WARNING - both categorical and numerical data must fit into int32 for
# the following code to work correctly
with open(output_file_path, "wb") as output_file:
if split == "train":
for input_file in input_files:
print("Processing file: ", input_file)
np_data = np.load(input_file)
np_data = np.concatenate(
[np_data["y"].reshape(-1, 1), np_data["X_int"], np_data["X_cat"]],
axis=1,
)
np_data = np_data.astype(np.int32)
output_file.write(np_data.tobytes())
else:
assert len(input_files) == 1
np_data = np.load(input_files[0])
np_data = np.concatenate(
[np_data["y"].reshape(-1, 1), np_data["X_int"], np_data["X_cat"]],
axis=1,
)
np_data = np_data.astype(np.int32)
samples_in_file = np_data.shape[0]
midpoint = int(np.ceil(samples_in_file / 2.0))
if split == "test":
begin = 0
end = midpoint
elif split == "val":
begin = midpoint
end = samples_in_file
else:
raise ValueError("Unknown split value: ", split)
output_file.write(np_data[begin:end].tobytes())
def _preprocess(args):
train_files = [
"{}_{}_reordered.npz".format(args.input_data_prefix, day)
for day in range(0, 23)
]
test_valid_file = args.input_data_prefix + "_23_reordered.npz"
os.makedirs(args.output_directory, exist_ok=True)
for split in ["train", "val", "test"]:
print("Running preprocessing for split =", split)
output_file = os.path.join(args.output_directory, "{}_data.bin".format(split))
input_files = train_files if split == "train" else [test_valid_file]
numpy_to_binary(
input_files=input_files, output_file_path=output_file, split=split
)
def _test_bin():
parser = argparse.ArgumentParser()
parser.add_argument("--output_directory", required=True)
parser.add_argument("--input_data_prefix", required=True)
parser.add_argument("--split", choices=["train", "test", "val"], required=True)
args = parser.parse_args()
_preprocess(args)
binary_data_file = os.path.join(
args.output_directory, "{}_data.bin".format(args.split)
)
counts_file = os.path.join(args.output_directory, "day_fea_count.npz")
dataset_binary = CriteoBinDataset(
data_file=binary_data_file,
counts_file=counts_file,
batch_size=2048,
)
from dlrm_data_pytorch import (
collate_wrapper_criteo_offset as collate_wrapper_criteo,
CriteoDataset,
)
binary_loader = torch.utils.data.DataLoader(
dataset_binary,
batch_size=None,
shuffle=False,
num_workers=0,
collate_fn=None,
pin_memory=False,
drop_last=False,
)
original_dataset = CriteoDataset(
dataset="terabyte",
max_ind_range=10 * 1000 * 1000,
sub_sample_rate=1,
randomize=True,
split=args.split,
raw_path=args.input_data_prefix,
pro_data="dummy_string",
memory_map=True,
)
original_loader = torch.utils.data.DataLoader(
original_dataset,
batch_size=2048,
shuffle=False,
num_workers=0,
collate_fn=collate_wrapper_criteo,
pin_memory=False,
drop_last=False,
)
assert len(dataset_binary) == len(original_loader)
for i, (old_batch, new_batch) in tqdm(
enumerate(zip(original_loader, binary_loader)), total=len(dataset_binary)
):
for j in range(len(new_batch)):
if not np.array_equal(old_batch[j], new_batch[j]):
raise ValueError("FAILED: Datasets not equal")
if i > len(dataset_binary):
break
print("PASSED")
if __name__ == "__main__":
_test()
_test_bin()