Commit c43a53e4 authored by sunxx1

Merge branch 'add_Recommendation' into 'main'

Add VAE-CF and DLRM

See merge request dcutoolkit/deeplearing/dlexamples_new!24
parents 5394b117 56225fdf
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from functools import partial
def round_n(x, n=8):
return n * int(np.ceil(x / n))
round_8 = partial(round_n, n=8)
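# Usage note (illustrative): round_8 rounds an integer up to the nearest multiple of 8,
# e.g. round_8(10) == 16 and round_8(16) == 16; round_n(10, n=4) == 12.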
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code
## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install torch==1.3.1
WORKDIR /code
ADD . .
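# Illustrative build/run commands (the "dlrm" image tag is an assumption, not part of this repo):
#   docker build -t dlrm .
#   docker run --rm -it dlrm python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6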
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Deep Learning Recommendation Model for Personalization and Recommendation Systems:
=================================================================================
## Model Structure
```
output:
probability of a click
model: |
/\
/__\
|
_____________________> Op <___________________
/ | \
/\ /\ /\
/__\ /__\ ... /__\
| | |
| Op Op
| ____/__\_____ ____/__\____
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```
More precise definition of model layers:
1) fully connected layers of an mlp
z = f(y)
y = Wx + b
2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
z = Op(e1,...,ek)
obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
3) Operator Op can be one of the following
Sum(e1,...,ek) = e1 + ... + ek
Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
Cat(e1,...,ek) = [e1', ..., ek']'
where ' denotes transpose operation
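As a quick, self-contained illustration of the three operators above (a minimal sketch, not the code used by `dlrm_s_pytorch.py`; the function names are ours):
```
import torch

def op_sum(embs):            # Sum(e1,...,ek) = e1 + ... + ek
    return torch.stack(embs).sum(dim=0)

def op_cat(embs):            # Cat(e1,...,ek) = [e1', ..., ek']'
    return torch.cat(embs)

def op_dot(embs):            # Dot(e1,...,ek) = all pairwise products ei'ej
    E = torch.stack(embs)    # (k, d)
    return (E @ E.t()).flatten()

embs = [torch.randn(2) for _ in range(3)]
print(op_sum(embs).shape, op_cat(embs).shape, op_dot(embs).shape)
# torch.Size([2]) torch.Size([6]) torch.Size([9])
```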
Running the Tests
--------------------
1) Simple model test
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
```
2) Debug mode (model parameters and sizes can be set manually)
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
model arch:
mlp top arch 3 layers, with input to output dimensions:
[8 4 2 1]
# of interactions
8
mlp bot arch 2 layers, with input to output dimensions:
[4 3 2]
# of features (sparse and dense)
4
dense feature size
4
sparse feature size
2
# of embeddings (= # of sparse features) 3, with dimensions 2x:
[4 3 2]
data (inputs and targets):
mini-batch: 0
[[0.69647 0.28614 0.22685 0.55131]
[0.71947 0.42311 0.98076 0.68483]]
[[[1], [0, 1]], [[0], [1]], [[1], [0]]]
[[0.55679]
[0.15896]]
mini-batch: 1
[[0.36179 0.22826 0.29371 0.63098]
[0.0921 0.4337 0.43086 0.49369]]
[[[1], [0, 2, 3]], [[1], [1, 2]], [[1], [1]]]
[[0.15307]
[0.69553]]
mini-batch: 2
[[0.60306 0.54507 0.34276 0.30412]
[0.41702 0.6813 0.87546 0.51042]]
[[[2], [0, 1, 2]], [[1], [2]], [[1], [1]]]
[[0.31877]
[0.69197]]
initial parameters (weights and bias):
[[ 0.05438 -0.11105]
[ 0.42513 0.34167]
[-0.1426 -0.45641]
[-0.19523 -0.10181]]
[[ 0.23667 0.57199]
[-0.16638 0.30316]
[ 0.10759 0.22136]]
[[-0.49338 -0.14301]
[-0.36649 -0.22139]]
[[0.51313 0.66662 0.10591 0.13089]
[0.32198 0.66156 0.84651 0.55326]
[0.85445 0.38484 0.31679 0.35426]]
[0.17108 0.82911 0.33867]
[[0.55237 0.57855 0.52153]
[0.00269 0.98835 0.90534]]
[0.20764 0.29249]
[[0.52001 0.90191 0.98363 0.25754 0.56436 0.80697 0.39437 0.73107]
[0.16107 0.6007 0.86586 0.98352 0.07937 0.42835 0.20454 0.45064]
[0.54776 0.09333 0.29686 0.92758 0.569 0.45741 0.75353 0.74186]
[0.04858 0.7087 0.83924 0.16594 0.781 0.28654 0.30647 0.66526]]
[0.11139 0.66487 0.88786 0.69631]
[[0.44033 0.43821 0.7651 0.56564]
[0.0849 0.58267 0.81484 0.33707]]
[0.92758 0.75072]
[[0.57406 0.75164]]
[0.07915]
DLRM_Net(
(emb_l): ModuleList(
(0): EmbeddingBag(4, 2, mode=sum)
(1): EmbeddingBag(3, 2, mode=sum)
(2): EmbeddingBag(2, 2, mode=sum)
)
(bot_l): Sequential(
(0): Linear(in_features=4, out_features=3, bias=True)
(1): ReLU()
(2): Linear(in_features=3, out_features=2, bias=True)
(3): ReLU()
)
(top_l): Sequential(
(0): Linear(in_features=8, out_features=4, bias=True)
(1): ReLU()
(2): Linear(in_features=4, out_features=2, bias=True)
(3): ReLU()
(4): Linear(in_features=2, out_features=1, bias=True)
(5): Sigmoid()
)
)
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
updated parameters (weights and bias):
[[ 0.0543 -0.1112 ]
[ 0.42513 0.34167]
[-0.14283 -0.45679]
[-0.19532 -0.10197]]
[[ 0.23667 0.57199]
[-0.1666 0.30285]
[ 0.10751 0.22124]]
[[-0.49338 -0.14301]
[-0.36664 -0.22164]]
[[0.51313 0.66663 0.10591 0.1309 ]
[0.32196 0.66154 0.84649 0.55324]
[0.85444 0.38482 0.31677 0.35425]]
[0.17109 0.82907 0.33863]
[[0.55238 0.57857 0.52154]
[0.00265 0.98825 0.90528]]
[0.20764 0.29244]
[[0.51996 0.90184 0.98368 0.25752 0.56436 0.807 0.39437 0.73107]
[0.16096 0.60055 0.86596 0.98348 0.07938 0.42842 0.20453 0.45064]
[0.5476 0.0931 0.29701 0.92752 0.56902 0.45752 0.75351 0.74187]
[0.04849 0.70857 0.83933 0.1659 0.78101 0.2866 0.30646 0.66526]]
[0.11137 0.66482 0.88778 0.69627]
[[0.44029 0.43816 0.76502 0.56561]
[0.08485 0.5826 0.81474 0.33702]]
[0.92754 0.75067]
[[0.57379 0.7514 ]]
[0.07908]
```
Benchmarking
------------
1) Test with randomly generated data
```
./bench/dlrm_s_benchmark.sh
```
2) Testing with the [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/).
- Download and extract the dataset into /data/kaggle
```
mkdir -p /data/kaggle
tar xvf kaggle-display-advertising-challenge-dataset.tar.gz -C /data/kaggle
```
- Run the test script
```
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
- The dataset paths can be specified by modifying the following arguments in the script (see the example below)
- --raw-data-file=<path/train.txt> points to the raw training data
- --processed-data-file=<path/*.npz> points to the preprocessed data
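For example, since the script forwards a single optional extra-argument string to `dlrm_s_pytorch.py` (where the last occurrence of a repeated flag wins), the paths can also be supplied without editing the script; the paths below are illustrative:
```
./bench/dlrm_s_criteo_kaggle.sh "--raw-data-file=/data/kaggle/train.txt --processed-data-file=/data/kaggle/kaggleAdDisplayChallenge_processed.npz"
```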
Reference training results are shown below
<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3) Multi-node testing: the code supports distributed training; the gloo/nccl/mpi backends are currently supported.
```
# Single-node multi-DCU test (process count set via --nproc_per_node), using nccl for communication and randomly generated test data:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000 \
--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# For multi-node runs, add the following arguments:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
Saving and Loading Model Parameters
-------------------------------
* --save-model=<path/model.pt> : path and file name for saving the model
* --load-model=<path/model.pt> : load a previously saved model
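A minimal illustration using the random-data test from above (the file name is a placeholder):
```
# train briefly and save the model
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --save-model=./dlrm_model.pt
# load the saved model in a later run
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --load-model=./dlrm_model.pt
```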
Other
----
For additional usage examples, see https://github.com/facebookresearch/dlrm
Version
-------
0.1 : Initial release of the DLRM code
1.0 : DLRM with distributed training, cpu support for row-wise adagrad optimizer
Requirements
------------
pytorch (*11/10/20*)
scikit-learn
numpy
onnx (*optional*)
pydot (*optional*)
torchviz (*optional*)
mpi (*optional for distributed backend*)
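A minimal sketch of installing the Python dependencies with pip (pin versions to match your PyTorch/CUDA or ROCm environment; MPI is usually provided by the system rather than pip):
```
pip install torch scikit-learn numpy
# optional extras
pip install onnx pydot torchviz
```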
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
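# Example (illustrative): extra flags are forwarded as a single quoted string, e.g.
#   ./bench/dlrm_s_benchmark.sh "--num-batches=200"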
#echo $dlrm_extra_option
cpu=1
gpu=1
pt=1
c2=1
ncores=28 #12 #6
nsockets="0"
ngpus="1 2 4 8"
numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"
data=random #synthetic
print_freq=100
rand_seed=727
c2_net="async_scheduling"
#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384
#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "
c2_args=" --caffe2-net-type="${c2_net}
# CPU Benchmarking
if [ $cpu = 1 ]; then
echo "--------------------------------------------"
echo "CPU Benchmarking - running on $ncores cores"
echo "--------------------------------------------"
if [ $pt = 1 ]; then
outf="model1_CPU_PT_$ncores.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
if [ $c2 = 1 ]; then
outf="model1_CPU_C2_$ncores.log"
outp="dlrm_s_caffe2.prof"
echo "-------------------------------"
echo "Running C2 (log file: $outf)"
echo "-------------------------------"
cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file (collected from stderr above)
mv $outp ${outf//".log"/".prof"}
fi
fi
# GPU Benchmarking
if [ $gpu = 1 ]; then
echo "--------------------------------------------"
echo "GPU Benchmarking - running on $ngpus GPUs"
echo "--------------------------------------------"
for _ng in $ngpus
do
# weak scaling
# _mb_size=$((mb_size*_ng))
# strong scaling
_mb_size=$((mb_size*1))
_gpus=$(seq -s, 0 $((_ng-1)))
cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
echo "-------------------"
echo "Using GPUS: "$_gpus
echo "-------------------"
if [ $pt = 1 ]; then
outf="model1_GPU_PT_$_ng.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
if [ $c2 = 1 ]; then
outf="model1_GPU_C2_$_ng.log"
outp="dlrm_s_caffe2.prof"
echo "-------------------------------"
echo "Running C2 (log file: $outf)"
echo "-------------------------------"
cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file (collected from stderr above)
mv $outp ${outf//".log"/".prof"}
fi
done
fi
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
#dlrm_c2_bin="python dlrm_s_caffe2.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=/data/kaggle/train.txt --processed-data-file=/data/kaggle/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log
#echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
#$dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log
echo "done"
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log
echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log
echo "done"
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log
echo "done"
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals
from setuptools import setup
from Cython.Build import cythonize
from distutils.extension import Extension
ext_modules = [
Extension(
"data_utils_cython",
["data_utils_cython.pyx"],
extra_compile_args=['-O3'],
extra_link_args=['-O3'],
)
]
setup(
name='data_utils_cython',
ext_modules=cythonize(ext_modules)
)
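# Example build command (also documented in cython_criteo.py):
#   python cython_compile.py build_ext --inplace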
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
# > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile the data_utils_cython.pyx to generate .so
# (it's important to keep the .pyx extension rather than .py
# to ensure the C/C++ .so, not the .py, is loaded at import time)
# > python cython_compile.py build_ext --inplace
# This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals
import data_utils_cython as duc
if __name__ == "__main__":
### import packages ###
import argparse
### parse arguments ###
parser = argparse.ArgumentParser(
description="Preprocess Criteo dataset"
)
# model related parameters
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--memory-map", action="store_true", default=False)
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
args = parser.parse_args()
duc.loadDataset(
args.data_set,
args.max_ind_range,
args.data_sub_sample_rate,
args.data_randomize,
"train",
args.raw_data_file,
args.processed_data_file,
args.memory_map
)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import numpy as np
from torch.utils.data import Dataset
import torch
import time
import math
from tqdm import tqdm
import argparse
class DataLoader:
"""
DataLoader dedicated for the Criteo Terabyte Click Logs dataset
"""
def __init__(
self,
data_filename,
data_directory,
days,
batch_size,
max_ind_range=-1,
split="train",
drop_last_batch=False
):
self.data_filename = data_filename
self.data_directory = data_directory
self.days = days
self.batch_size = batch_size
self.max_ind_range = max_ind_range
total_file = os.path.join(
data_directory,
data_filename + "_day_count.npz"
)
with np.load(total_file) as data:
total_per_file = data["total_per_file"][np.array(days)]
self.length = sum(total_per_file)
if split == "test" or split == "val":
self.length = int(np.ceil(self.length / 2.))
self.split = split
self.drop_last_batch = drop_last_batch
def __iter__(self):
return iter(
_batch_generator(
self.data_filename, self.data_directory, self.days,
self.batch_size, self.split, self.drop_last_batch, self.max_ind_range
)
)
def __len__(self):
if self.drop_last_batch:
return self.length // self.batch_size
else:
return math.ceil(self.length / self.batch_size)
def _transform_features(
x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
):
if max_ind_range > 0:
x_cat_batch = x_cat_batch % max_ind_range
if flag_input_torch_tensor:
x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
else:
x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
batch_size = x_cat_batch.shape[0]
feature_count = x_cat_batch.shape[1]
lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
def _batch_generator(
data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
):
previous_file = None
for day in days:
filepath = os.path.join(
data_directory,
data_filename + "_{}_reordered.npz".format(day)
)
# print('Loading file: ', filepath)
with np.load(filepath) as data:
x_int = data["X_int"]
x_cat = data["X_cat"]
y = data["y"]
samples_in_file = y.shape[0]
batch_start_idx = 0
if split == "test" or split == "val":
length = int(np.ceil(samples_in_file / 2.))
if split == "test":
samples_in_file = length
elif split == "val":
batch_start_idx = samples_in_file - length
while batch_start_idx < samples_in_file - batch_size:
missing_samples = batch_size
if previous_file is not None:
missing_samples -= previous_file['y'].shape[0]
current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)
x_int_batch = x_int[current_slice]
x_cat_batch = x_cat[current_slice]
y_batch = y[current_slice]
if previous_file is not None:
x_int_batch = np.concatenate(
[previous_file['x_int'], x_int_batch],
axis=0
)
x_cat_batch = np.concatenate(
[previous_file['x_cat'], x_cat_batch],
axis=0
)
y_batch = np.concatenate([previous_file['y'], y_batch], axis=0)
previous_file = None
if x_int_batch.shape[0] != batch_size:
raise ValueError('should not happen')
yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)
batch_start_idx += missing_samples
if batch_start_idx != samples_in_file:
current_slice = slice(batch_start_idx, samples_in_file)
if previous_file is not None:
previous_file = {
'x_int' : np.concatenate(
[previous_file['x_int'], x_int[current_slice]],
axis=0
),
'x_cat' : np.concatenate(
[previous_file['x_cat'], x_cat[current_slice]],
axis=0
),
'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0)
}
else:
previous_file = {
'x_int' : x_int[current_slice],
'x_cat' : x_cat[current_slice],
'y' : y[current_slice]
}
if not drop_last:
yield _transform_features(
previous_file['x_int'],
previous_file['x_cat'],
previous_file['y'],
max_ind_range
)
def _test():
generator = _batch_generator(
data_filename='day',
data_directory='./input',
days=range(23),
split="train",
batch_size=2048,
drop_last=True,
max_ind_range=-1
)
t1 = time.time()
for x_int, lS_o, x_cat, y in generator:
t2 = time.time()
time_diff = t2 - t1
t1 = t2
print(
"time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape
)
)
class CriteoBinDataset(Dataset):
"""Binary version of criteo dataset."""
def __init__(self, data_file, counts_file,
batch_size=1, max_ind_range=-1, bytes_per_feature=4):
# dataset
self.tar_fea = 1 # single target
self.den_fea = 13 # 13 dense features
self.spa_fea = 26 # 26 sparse features
self.tad_fea = self.tar_fea + self.den_fea
self.tot_fea = self.tad_fea + self.spa_fea
self.batch_size = batch_size
self.max_ind_range = max_ind_range
self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
print('data file:', data_file, 'number of batches:', self.num_entries)
self.file = open(data_file, 'rb')
with np.load(counts_file) as data:
self.counts = data["counts"]
# hardcoded for now
self.m_den = 13
def __len__(self):
return self.num_entries
def __getitem__(self, idx):
self.file.seek(idx * self.bytes_per_entry, 0)
raw_data = self.file.read(self.bytes_per_entry)
array = np.frombuffer(raw_data, dtype=np.int32)
tensor = torch.from_numpy(array).view((-1, self.tot_fea))
return _transform_features(x_int_batch=tensor[:, 1:14],
x_cat_batch=tensor[:, 14:],
y_batch=tensor[:, 0],
max_ind_range=self.max_ind_range,
flag_input_torch_tensor=True)
def __del__(self):
self.file.close()
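# Illustrative usage (file names follow the _preprocess/_test_bin helpers below):
#   dataset = CriteoBinDataset(data_file='train_data.bin',
#                              counts_file='day_fea_count.npz', batch_size=2048)
#   x_int, lS_o, x_cat, y = dataset[0]   # one full batch per index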
def numpy_to_binary(input_files, output_file_path, split='train'):
"""Convert the data to a binary format to be read with CriteoBinDataset."""
# WARNING - both categorical and numerical data must fit into int32 for
# the following code to work correctly
with open(output_file_path, 'wb') as output_file:
if split == 'train':
for input_file in input_files:
print('Processing file: ', input_file)
np_data = np.load(input_file)
np_data = np.concatenate([np_data['y'].reshape(-1, 1),
np_data['X_int'],
np_data['X_cat']], axis=1)
np_data = np_data.astype(np.int32)
output_file.write(np_data.tobytes())
else:
assert len(input_files) == 1
np_data = np.load(input_files[0])
np_data = np.concatenate([np_data['y'].reshape(-1, 1),
np_data['X_int'],
np_data['X_cat']], axis=1)
np_data = np_data.astype(np.int32)
samples_in_file = np_data.shape[0]
midpoint = int(np.ceil(samples_in_file / 2.))
if split == "test":
begin = 0
end = midpoint
elif split == "val":
begin = midpoint
end = samples_in_file
else:
raise ValueError('Unknown split value: ', split)
output_file.write(np_data[begin:end].tobytes())
def _preprocess(args):
train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for
day in range(0, 23)]
test_valid_file = args.input_data_prefix + '_23_reordered.npz'
os.makedirs(args.output_directory, exist_ok=True)
for split in ['train', 'val', 'test']:
print('Running preprocessing for split =', split)
output_file = os.path.join(args.output_directory,
'{}_data.bin'.format(split))
input_files = train_files if split == 'train' else [test_valid_file]
numpy_to_binary(input_files=input_files,
output_file_path=output_file,
split=split)
def _test_bin():
parser = argparse.ArgumentParser()
parser.add_argument('--output_directory', required=True)
parser.add_argument('--input_data_prefix', required=True)
parser.add_argument('--split', choices=['train', 'test', 'val'],
required=True)
args = parser.parse_args()
_preprocess(args)
binary_data_file = os.path.join(args.output_directory,
'{}_data.bin'.format(args.split))
counts_file = os.path.join(args.output_directory, 'day_fea_count.npz')
dataset_binary = CriteoBinDataset(data_file=binary_data_file,
counts_file=counts_file,
batch_size=2048,)
from dlrm_data_pytorch import CriteoDataset
from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo
binary_loader = torch.utils.data.DataLoader(
dataset_binary,
batch_size=None,
shuffle=False,
num_workers=0,
collate_fn=None,
pin_memory=False,
drop_last=False,
)
original_dataset = CriteoDataset(
dataset='terabyte',
max_ind_range=10 * 1000 * 1000,
sub_sample_rate=1,
randomize=True,
split=args.split,
raw_path=args.input_data_prefix,
pro_data='dummy_string',
memory_map=True
)
original_loader = torch.utils.data.DataLoader(
original_dataset,
batch_size=2048,
shuffle=False,
num_workers=0,
collate_fn=collate_wrapper_criteo,
pin_memory=False,
drop_last=False,
)
assert len(dataset_binary) == len(original_loader)
for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader,
binary_loader)),
total=len(dataset_binary)):
for j in range(len(new_batch)):
if not np.array_equal(old_batch[j], new_batch[j]):
raise ValueError('FAILED: Datasets not equal')
if i > len(dataset_binary):
break
print('PASSED')
if __name__ == '__main__':
_test()
_test_bin()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import builtins
import os
import sys
import torch
import torch.distributed as dist
from torch.autograd import Function
from torch.autograd.profiler import record_function
from torch.nn.parallel import DistributedDataParallel as DDP
try:
import torch_ccl
except ImportError as e:
# print(e)
torch_ccl = False
try:
import torch_ucc
except ImportError as e:
torch_ucc = False
my_rank = -1
my_size = -1
my_local_rank = -1
my_local_size = -1
alltoall_supported = False
a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "")
myreq = None
def env2int(env_list, default=-1):
for e in env_list:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def get_my_slice(n):
k, m = divmod(n, my_size)
return slice(
my_rank * k + min(my_rank, m), (my_rank + 1) * k + min(my_rank + 1, m), 1
)
def get_split_lengths(n):
k, m = divmod(n, my_size)
if m == 0:
splits = None
my_len = k
else:
splits = [(k + 1) if i < m else k for i in range(my_size)]
my_len = splits[my_rank]
return (my_len, splits)
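# Illustrative example: with my_size = 4 and n = 10, divmod gives k = 2, m = 2, so
# get_split_lengths(10) returns (3, [3, 3, 2, 2]) on ranks 0-1 and (2, [3, 3, 2, 2])
# on ranks 2-3; when n divides evenly, splits is None and my_len = n // my_size.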
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
global myreq
global my_rank
global my_size
global my_local_rank
global my_local_size
global a2a_impl
global alltoall_supported
# guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
num_mpi_ranks = env2int(
["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
)
if backend == "" and num_mpi_ranks > 1:
if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
backend = "ccl"
elif use_gpu and dist.is_nccl_available():
backend = "nccl"
elif dist.is_mpi_available():
backend = "mpi"
else:
print(
"WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
)
backend = "gloo"
if backend != "":
# guess Rank and size
if rank == -1:
rank = env2int(
["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
)
if size == -1:
size = env2int(
[
"PMI_SIZE",
"OMPI_COMM_WORLD_SIZE",
"MV2_COMM_WORLD_SIZE",
"WORLD_SIZE",
],
1,
)
if not os.environ.get("RANK", None) and rank != -1:
os.environ["RANK"] = str(rank)
if not os.environ.get("WORLD_SIZE", None) and size != -1:
os.environ["WORLD_SIZE"] = str(size)
if not os.environ.get("MASTER_PORT", None):
os.environ["MASTER_PORT"] = "29500"
if not os.environ.get("MASTER_ADDR", None):
local_size = env2int(
[
"MPI_LOCALNRANKS",
"OMPI_COMM_WORLD_LOCAL_SIZE",
"MV2_COMM_WORLD_LOCAL_SIZE",
],
1,
)
if local_size != size and backend != "mpi":
print(
"Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
)
print(
"If this run hangs, try exporting rank 0's hostname as MASTER_ADDR"
)
os.environ["MASTER_ADDR"] = "127.0.0.1"
if size > 1:
if local_rank == -1:
my_local_rank = env2int(
[
"MPI_LOCALRANKID",
"OMPI_COMM_WORLD_LOCAL_RANK",
"MV2_COMM_WORLD_LOCAL_RANK",
"LOCAL_RANK",
],
0,
)
else:
my_local_rank = local_rank
my_local_size = env2int(
[
"MPI_LOCALNRANKS",
"OMPI_COMM_WORLD_LOCAL_SIZE",
"MV2_COMM_WORLD_LOCAL_SIZE",
],
1,
)
if use_gpu:
if my_local_size > torch.cuda.device_count():
print(
"Not sufficient GPUs available... local_size = %d, ngpus = %d"
% (my_local_size, torch.cuda.device_count())
)
sys.exit(1)
torch.cuda.set_device(my_local_rank)
dist.init_process_group(backend, rank=rank, world_size=size)
my_rank = dist.get_rank()
my_size = dist.get_world_size()
if my_rank == 0:
print("Running on %d ranks using %s backend" % (my_size, backend))
if hasattr(dist, "all_to_all_single"):
try:
t = torch.zeros([4])
if use_gpu:
t = t.cuda()
dist.all_to_all_single(t, t)
alltoall_supported = True
except RuntimeError as err:
print("fail to enable all_to_all_single primitive: %s" % err)
if a2a_impl == "alltoall" and alltoall_supported == False:
print(
"Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
% (a2a_impl, backend)
)
a2a_impl = "scatter"
if a2a_impl != "":
print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
else:
my_rank = 0
my_size = 1
my_local_rank = 0
my_local_size = 1
print_all(
"world size: %d, current rank: %d, local rank: %d"
% (my_size, my_rank, my_local_rank)
)
myreq = Request()
class Request(object):
def __init__(self):
self.req = None
self.tensor = None
self.WaitFunction = All2All_Scatter_Wait
def wait(self):
ret = self.WaitFunction.apply(*self.tensor)
self.req = None
self.tensor = None
return ret
class All2All_ScatterList_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
gather_list = []
req_list = []
for i in range(my_size):
for j in range(table_split_lengths[i]):
out_tensor = inputs[0].new_empty(
[a2a_info.local_batch_num, a2a_info.emb_dim]
)
scatter_list = (
list(inputs[j].split(batch_split_lengths, dim=0))
if i == my_rank
else []
)
req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True)
gather_list.append(out_tensor)
req_list.append(req)
myreq.req = req_list
myreq.tensor = tuple(gather_list)
myreq.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
for r in myreq.req:
r.wait()
myreq.req = None
grad_inputs = myreq.tensor
myreq.tensor = None
return (None, *grad_inputs)
class All2All_ScatterList_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
ctx.a2a_info = myreq.a2a_info
for r in myreq.req:
r.wait()
myreq.req = None
myreq.tensor = None
return output
@staticmethod
def backward(ctx, *grad_output):
global myreq
a2a_info = ctx.a2a_info
grad_output = [t.contiguous() for t in grad_output]
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else [a2a_info.local_batch_num] * my_size
)
per_rank_table_splits = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
grad_inputs = [
grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim])
for _ in range(a2a_info.local_table_num)
]
req_list = []
ind = 0
for i in range(my_size):
for j in range(per_rank_table_splits[i]):
gather_list = (
list(grad_inputs[j].split(batch_split_lengths, dim=0))
if i == my_rank
else None
)
req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True)
req_list.append(req)
ind += 1
myreq.req = req_list
myreq.tensor = grad_inputs
return tuple(grad_output)
class All2All_Scatter_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
input = torch.cat(inputs, dim=1)
scatter_list = list(input.split(batch_split_lengths, dim=0))
gather_list = []
req_list = []
for i in range(my_size):
out_tensor = input.new_empty(
[a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim]
)
req = dist.scatter(
out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True
)
gather_list.append(out_tensor)
req_list.append(req)
myreq.req = req_list
myreq.tensor = tuple(gather_list)
myreq.a2a_info = a2a_info
ctx.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
for r in myreq.req:
r.wait()
myreq.req = None
grad_input = myreq.tensor
grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1)
myreq.tensor = None
return (None, *grad_inputs)
class All2All_Scatter_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
ctx.a2a_info = myreq.a2a_info
for r in myreq.req:
r.wait()
myreq.req = None
myreq.tensor = None
return output
@staticmethod
def backward(ctx, *grad_output):
global myreq
assert len(grad_output) == my_size
scatter_list = [t.contiguous() for t in grad_output]
a2a_info = ctx.a2a_info
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
grad_input = grad_output[0].new_empty(
[a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num]
)
gather_list = list(grad_input.split(batch_split_lengths, dim=0))
req_list = []
for i in range(my_size):
req = dist.gather(
scatter_list[i],
gather_list if i == my_rank else [],
dst=i,
async_op=True,
)
req_list.append(req)
myreq.req = req_list
myreq.tensor = grad_input
return grad_output
class All2All_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
with record_function("DLRM alltoall_req_fwd_single"):
batch_split_lengths = a2a_info.global_batch_partition_slices
if batch_split_lengths:
batch_split_lengths = [
m * a2a_info.emb_dim * a2a_info.local_table_num
for m in batch_split_lengths
]
table_split_lengths = a2a_info.global_table_wise_parition_slices
if table_split_lengths:
table_split_lengths = [
a2a_info.local_batch_num * e * a2a_info.emb_dim
for e in table_split_lengths
]
input = torch.cat(inputs, dim=1).view([-1])
output = input.new_empty(
[
a2a_info.global_table_num
* a2a_info.local_batch_num
* a2a_info.emb_dim
]
)
req = dist.all_to_all_single(
output, input, table_split_lengths, batch_split_lengths, async_op=True
)
myreq.req = req
myreq.tensor = []
myreq.tensor.append(output)
myreq.tensor = tuple(myreq.tensor)
a2a_info.batch_split_lengths = batch_split_lengths
a2a_info.table_split_lengths = table_split_lengths
myreq.a2a_info = a2a_info
ctx.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
with record_function("DLRM alltoall_req_bwd_single"):
a2a_info = ctx.a2a_info
myreq.req.wait()
myreq.req = None
grad_input = myreq.tensor
grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split(
a2a_info.emb_dim, dim=1
)
grad_inputs = [gin.contiguous() for gin in grad_inputs]
myreq.tensor = None
return (None, *grad_inputs)
class All2All_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
with record_function("DLRM alltoall_wait_fwd_single"):
a2a_info = myreq.a2a_info
ctx.a2a_info = a2a_info
myreq.req.wait()
myreq.req = None
myreq.tensor = None
table_split_lengths = (
a2a_info.table_split_lengths
if a2a_info.table_split_lengths
else a2a_info.local_table_num
* a2a_info.local_batch_num
* a2a_info.emb_dim
)
outputs = output[0].split(table_split_lengths)
outputs = tuple(
[out.view([a2a_info.local_batch_num, -1]) for out in outputs]
)
return outputs
@staticmethod
def backward(ctx, *grad_outputs):
global myreq
with record_function("DLRM alltoall_wait_bwd_single"):
a2a_info = ctx.a2a_info
grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
grad_output = torch.cat(grad_outputs)
grad_input = grad_output.new_empty(
[a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim]
)
req = dist.all_to_all_single(
grad_input,
grad_output,
a2a_info.batch_split_lengths,
a2a_info.table_split_lengths,
async_op=True,
)
myreq.req = req
myreq.tensor = grad_input
return (grad_output,)
class AllGather(Function):
@staticmethod
def forward(ctx, input, global_lengths, dim=0):
if not isinstance(global_lengths, (list, tuple)):
global_lengths = [global_lengths] * my_size
assert len(global_lengths) == my_size
assert global_lengths[my_rank] == input.size(dim)
local_start = sum(global_lengths[:my_rank])
output_size = list(input.size())
ctx.dim = dim
ctx.local_start = local_start
ctx.local_length = global_lengths[my_rank]
input = input.contiguous()
if dim == 0:
out_len = sum(global_lengths)
output_size[dim] = out_len
output = input.new_empty(output_size)
gather_list = list(output.split(global_lengths, dim=0))
else:
gather_list = []
for length in global_lengths:
output_size[dim] = length
gather_list.append(input.new_empty(output_size))
dist.all_gather(gather_list, input)
if dim != 0:
output = torch.cat(gather_list, dim=dim)
return output
@staticmethod
def backward(ctx, grad_output):
# print("Inside All2AllBackward")
dim = ctx.dim
start = ctx.local_start
length = ctx.local_length
grad_input = grad_output.narrow(dim, start, length)
return (grad_input, None, None)
class All2AllInfo(object):
pass
def alltoall(inputs, per_rank_table_splits):
global myreq
batch_size, emb_dim = inputs[0].size()
a2a_info = All2AllInfo()
a2a_info.local_table_num = len(inputs)
a2a_info.global_table_wise_parition_slices = per_rank_table_splits
(
a2a_info.local_batch_num,
a2a_info.global_batch_partition_slices,
) = get_split_lengths(batch_size)
a2a_info.emb_dim = emb_dim
a2a_info.batch_size = batch_size
a2a_info.global_table_num = (
sum(per_rank_table_splits)
if per_rank_table_splits
else a2a_info.local_table_num * my_size
)
if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall":
# print("Using All2All_Req")
output = All2All_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_Wait
elif a2a_impl == "" or a2a_impl == "scatter":
# print("Using All2All_Scatter_Req")
output = All2All_Scatter_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_Scatter_Wait
elif a2a_impl == "scatter_list":
# print("Using All2All_ScatterList_Req")
output = All2All_ScatterList_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_ScatterList_Wait
else:
print(
"Unknown value set for DLRM_ALLTOALL_IMPL (%s), "
"please use one of [alltoall, scatter, scatter_list]" % a2a_impl
)
return myreq
def all_gather(input, lengths, dim=0):
if not lengths:
lengths = [input.size(0)] * my_size
return AllGather.apply(input, lengths, dim)
def barrier():
if my_size > 1:
dist.barrier()
# Override builtin print function to print only from rank 0
orig_print = builtins.print
def rank0_print(*args, **kwargs):
if my_rank <= 0 or kwargs.get("print_all", False):
orig_print(*args, **kwargs)
builtins.print = rank0_print
# Allow printing from all rank with explicit print_all
def print_all(*args, **kwargs):
orig_print(*args, **kwargs)