Commit c43a53e4 authored by sunxx1

Merge branch 'add_Recommendation' into 'main'

Add VAE-CF and DLRM

See merge request dcutoolkit/deeplearing/dlexamples_new!24
parents 5394b117 56225fdf
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from functools import partial
def round_n(x, n=8):
return n * int(np.ceil(x / n))
round_8 = partial(round_n, n=8)
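# Usage note (illustrative): round_8 rounds an integer up to the nearest multiple of 8,
# e.g. round_8(10) == 16 and round_8(16) == 16; round_n(10, n=4) == 12.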
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code
## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install torch==1.3.1
WORKDIR /code
ADD . .
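# Illustrative build/run commands (the "dlrm" image tag is an assumption, not part of this repo):
#   docker build -t dlrm .
#   docker run --rm -it dlrm python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6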
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Deep Learning Recommendation Model for Personalization and Recommendation Systems:
=================================================================================
## Model Structure
```
output:
probability of a click
model: |
/\
/__\
|
_____________________> Op <___________________
/ | \
/\ /\ /\
/__\ /__\ ... /__\
| | |
| Op Op
| ____/__\_____ ____/__\____
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```
More precise definition of model layers:
1) fully connected layers of an mlp
z = f(y)
y = Wx + b
2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
z = Op(e1,...,ek)
obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
3) Operator Op can be one of the following
Sum(e1,...,ek) = e1 + ... + ek
Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
Cat(e1,...,ek) = [e1', ..., ek']'
where ' denotes transpose operation
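As a quick, self-contained illustration of the three operators above (a minimal sketch, not the code used by `dlrm_s_pytorch.py`; the function names are ours):
```
import torch

def op_sum(embs):            # Sum(e1,...,ek) = e1 + ... + ek
    return torch.stack(embs).sum(dim=0)

def op_cat(embs):            # Cat(e1,...,ek) = [e1', ..., ek']'
    return torch.cat(embs)

def op_dot(embs):            # Dot(e1,...,ek) = all pairwise products ei'ej
    E = torch.stack(embs)    # (k, d)
    return (E @ E.t()).flatten()

embs = [torch.randn(2) for _ in range(3)]
print(op_sum(embs).shape, op_cat(embs).shape, op_dot(embs).shape)
# torch.Size([2]) torch.Size([6]) torch.Size([9])
```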
Running the Tests
--------------------
1) Simple model test
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
```
2) Debug mode (model parameters and sizes can be set manually)
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
model arch:
mlp top arch 3 layers, with input to output dimensions:
[8 4 2 1]
# of interactions
8
mlp bot arch 2 layers, with input to output dimensions:
[4 3 2]
# of features (sparse and dense)
4
dense feature size
4
sparse feature size
2
# of embeddings (= # of sparse features) 3, with dimensions 2x:
[4 3 2]
data (inputs and targets):
mini-batch: 0
[[0.69647 0.28614 0.22685 0.55131]
[0.71947 0.42311 0.98076 0.68483]]
[[[1], [0, 1]], [[0], [1]], [[1], [0]]]
[[0.55679]
[0.15896]]
mini-batch: 1
[[0.36179 0.22826 0.29371 0.63098]
[0.0921 0.4337 0.43086 0.49369]]
[[[1], [0, 2, 3]], [[1], [1, 2]], [[1], [1]]]
[[0.15307]
[0.69553]]
mini-batch: 2
[[0.60306 0.54507 0.34276 0.30412]
[0.41702 0.6813 0.87546 0.51042]]
[[[2], [0, 1, 2]], [[1], [2]], [[1], [1]]]
[[0.31877]
[0.69197]]
initial parameters (weights and bias):
[[ 0.05438 -0.11105]
[ 0.42513 0.34167]
[-0.1426 -0.45641]
[-0.19523 -0.10181]]
[[ 0.23667 0.57199]
[-0.16638 0.30316]
[ 0.10759 0.22136]]
[[-0.49338 -0.14301]
[-0.36649 -0.22139]]
[[0.51313 0.66662 0.10591 0.13089]
[0.32198 0.66156 0.84651 0.55326]
[0.85445 0.38484 0.31679 0.35426]]
[0.17108 0.82911 0.33867]
[[0.55237 0.57855 0.52153]
[0.00269 0.98835 0.90534]]
[0.20764 0.29249]
[[0.52001 0.90191 0.98363 0.25754 0.56436 0.80697 0.39437 0.73107]
[0.16107 0.6007 0.86586 0.98352 0.07937 0.42835 0.20454 0.45064]
[0.54776 0.09333 0.29686 0.92758 0.569 0.45741 0.75353 0.74186]
[0.04858 0.7087 0.83924 0.16594 0.781 0.28654 0.30647 0.66526]]
[0.11139 0.66487 0.88786 0.69631]
[[0.44033 0.43821 0.7651 0.56564]
[0.0849 0.58267 0.81484 0.33707]]
[0.92758 0.75072]
[[0.57406 0.75164]]
[0.07915]
DLRM_Net(
(emb_l): ModuleList(
(0): EmbeddingBag(4, 2, mode=sum)
(1): EmbeddingBag(3, 2, mode=sum)
(2): EmbeddingBag(2, 2, mode=sum)
)
(bot_l): Sequential(
(0): Linear(in_features=4, out_features=3, bias=True)
(1): ReLU()
(2): Linear(in_features=3, out_features=2, bias=True)
(3): ReLU()
)
(top_l): Sequential(
(0): Linear(in_features=8, out_features=4, bias=True)
(1): ReLU()
(2): Linear(in_features=4, out_features=2, bias=True)
(3): ReLU()
(4): Linear(in_features=2, out_features=1, bias=True)
(5): Sigmoid()
)
)
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
updated parameters (weights and bias):
[[ 0.0543 -0.1112 ]
[ 0.42513 0.34167]
[-0.14283 -0.45679]
[-0.19532 -0.10197]]
[[ 0.23667 0.57199]
[-0.1666 0.30285]
[ 0.10751 0.22124]]
[[-0.49338 -0.14301]
[-0.36664 -0.22164]]
[[0.51313 0.66663 0.10591 0.1309 ]
[0.32196 0.66154 0.84649 0.55324]
[0.85444 0.38482 0.31677 0.35425]]
[0.17109 0.82907 0.33863]
[[0.55238 0.57857 0.52154]
[0.00265 0.98825 0.90528]]
[0.20764 0.29244]
[[0.51996 0.90184 0.98368 0.25752 0.56436 0.807 0.39437 0.73107]
[0.16096 0.60055 0.86596 0.98348 0.07938 0.42842 0.20453 0.45064]
[0.5476 0.0931 0.29701 0.92752 0.56902 0.45752 0.75351 0.74187]
[0.04849 0.70857 0.83933 0.1659 0.78101 0.2866 0.30646 0.66526]]
[0.11137 0.66482 0.88778 0.69627]
[[0.44029 0.43816 0.76502 0.56561]
[0.08485 0.5826 0.81474 0.33702]]
[0.92754 0.75067]
[[0.57379 0.7514 ]]
[0.07908]
```
Benchmarking
------------
1) Test with randomly generated data
```
./bench/dlrm_s_benchmark.sh
```
2) Testing with the [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/).
- Download and extract the dataset into /data/kaggle
```
mkdir -p /data/kaggle
tar xvf kaggle-display-advertising-challenge-dataset.tar.gz -C /data/kaggle
```
- Run the test script
```
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
- The dataset paths can be specified by modifying the following arguments in the script (see the example below)
- --raw-data-file=<path/train.txt> points to the raw training data
- --processed-data-file=<path/*.npz> points to the preprocessed data
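For example, since the script forwards a single optional extra-argument string to `dlrm_s_pytorch.py` (where the last occurrence of a repeated flag wins), the paths can also be supplied without editing the script; the paths below are illustrative:
```
./bench/dlrm_s_criteo_kaggle.sh "--raw-data-file=/data/kaggle/train.txt --processed-data-file=/data/kaggle/kaggleAdDisplayChallenge_processed.npz"
```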
Reference training results are shown below
<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3) Multi-node testing: the code supports distributed training; the gloo/nccl/mpi backends are currently supported.
```
# Single-node multi-DCU test (process count set via --nproc_per_node), using nccl for communication and randomly generated test data:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000 \
--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# For multi-node runs, add the following arguments:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
Saving and Loading Model Parameters
-------------------------------
* --save-model=<path/model.pt> : path and file name for saving the model
* --load-model=<path/model.pt> : load a previously saved model
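A minimal illustration using the random-data test from above (the file name is a placeholder):
```
# train briefly and save the model
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --save-model=./dlrm_model.pt
# load the saved model in a later run
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --load-model=./dlrm_model.pt
```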
Other
----
For additional usage examples, see https://github.com/facebookresearch/dlrm
Version
-------
0.1 : Initial release of the DLRM code
1.0 : DLRM with distributed training, cpu support for row-wise adagrad optimizer
Requirements
------------
pytorch (*11/10/20*)
scikit-learn
numpy
onnx (*optional*)
pydot (*optional*)
torchviz (*optional*)
mpi (*optional for distributed backend*)
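A minimal sketch of installing the Python dependencies with pip (pin versions to match your PyTorch/CUDA or ROCm environment; MPI is usually provided by the system rather than pip):
```
pip install torch scikit-learn numpy
# optional extras
pip install onnx pydot torchviz
```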
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
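# Example (illustrative): extra flags are forwarded as a single quoted string, e.g.
#   ./bench/dlrm_s_benchmark.sh "--num-batches=200"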
#echo $dlrm_extra_option
cpu=1
gpu=1
pt=1
c2=1
ncores=28 #12 #6
nsockets="0"
ngpus="1 2 4 8"
numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"
data=random #synthetic
print_freq=100
rand_seed=727
c2_net="async_scheduling"
#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384
#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "
c2_args=" --caffe2-net-type="${c2_net}
# CPU Benchmarking
if [ $cpu = 1 ]; then
echo "--------------------------------------------"
echo "CPU Benchmarking - running on $ncores cores"
echo "--------------------------------------------"
if [ $pt = 1 ]; then
outf="model1_CPU_PT_$ncores.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
if [ $c2 = 1 ]; then
outf="model1_CPU_C2_$ncores.log"
outp="dlrm_s_caffe2.prof"
echo "-------------------------------"
echo "Running C2 (log file: $outf)"
echo "-------------------------------"
cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file (collected from stderr above)
mv $outp ${outf//".log"/".prof"}
fi
fi
# GPU Benchmarking
if [ $gpu = 1 ]; then
echo "--------------------------------------------"
echo "GPU Benchmarking - running on $ngpus GPUs"
echo "--------------------------------------------"
for _ng in $ngpus
do
# weak scaling
# _mb_size=$((mb_size*_ng))
# strong scaling
_mb_size=$((mb_size*1))
_gpus=$(seq -s, 0 $((_ng-1)))
cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
echo "-------------------"
echo "Using GPUS: "$_gpus
echo "-------------------"
if [ $pt = 1 ]; then
outf="model1_GPU_PT_$_ng.log"
outp="dlrm_s_pytorch.prof"
echo "-------------------------------"
echo "Running PT (log file: $outf)"
echo "-------------------------------"
cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file(s)
mv $outp ${outf//".log"/".prof"}
mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
fi
if [ $c2 = 1 ]; then
outf="model1_GPU_C2_$_ng.log"
outp="dlrm_s_caffe2.prof"
echo "-------------------------------"
echo "Running C2 (log file: $outf)"
echo "-------------------------------"
cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp"
echo $cmd
eval $cmd
min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
echo "Min time per iteration = $min"
# move profiling file (collected from stderr above)
mv $outp ${outf//".log"/".prof"}
fi
done
fi
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
#dlrm_c2_bin="python dlrm_s_caffe2.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=/data/kaggle/train.txt --processed-data-file=/data/kaggle/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log
#echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
#$dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log
echo "done"
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"
echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log
echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log
echo "done"
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option
python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log
echo "done"
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals
from setuptools import setup
from Cython.Build import cythonize
from distutils.extension import Extension
ext_modules = [
Extension(
"data_utils_cython",
["data_utils_cython.pyx"],
extra_compile_args=['-O3'],
extra_link_args=['-O3'],
)
]
setup(
name='data_utils_cython',
ext_modules=cythonize(ext_modules)
)
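# Example build command (also documented in cython_criteo.py):
#   python cython_compile.py build_ext --inplace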
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
# > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile the data_utils_cython.pyx to generate .so
# (it's important to keep the .pyx extension rather than .py
# to ensure the C/C++ .so, not the .py, is loaded at import time)
# > python cython_compile.py build_ext --inplace
# This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals
import data_utils_cython as duc
if __name__ == "__main__":
### import packages ###
import argparse
### parse arguments ###
parser = argparse.ArgumentParser(
description="Preprocess Criteo dataset"
)
# model related parameters
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--memory-map", action="store_true", default=False)
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
args = parser.parse_args()
duc.loadDataset(
args.data_set,
args.max_ind_range,
args.data_sub_sample_rate,
args.data_randomize,
"train",
args.raw_data_file,
args.processed_data_file,
args.memory_map
)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import numpy as np
from torch.utils.data import Dataset
import torch
import time
import math
from tqdm import tqdm
import argparse
class DataLoader:
"""
DataLoader dedicated for the Criteo Terabyte Click Logs dataset
"""
def __init__(
self,
data_filename,
data_directory,
days,
batch_size,
max_ind_range=-1,
split="train",
drop_last_batch=False
):
self.data_filename = data_filename
self.data_directory = data_directory
self.days = days
self.batch_size = batch_size
self.max_ind_range = max_ind_range
total_file = os.path.join(
data_directory,
data_filename + "_day_count.npz"
)
with np.load(total_file) as data:
total_per_file = data["total_per_file"][np.array(days)]
self.length = sum(total_per_file)
if split == "test" or split == "val":
self.length = int(np.ceil(self.length / 2.))
self.split = split
self.drop_last_batch = drop_last_batch
def __iter__(self):
return iter(
_batch_generator(
self.data_filename, self.data_directory, self.days,
self.batch_size, self.split, self.drop_last_batch, self.max_ind_range
)
)
def __len__(self):
if self.drop_last_batch:
return self.length // self.batch_size
else:
return math.ceil(self.length / self.batch_size)
def _transform_features(
x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
):
if max_ind_range > 0:
x_cat_batch = x_cat_batch % max_ind_range
if flag_input_torch_tensor:
x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
else:
x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
batch_size = x_cat_batch.shape[0]
feature_count = x_cat_batch.shape[1]
lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
def _batch_generator(
data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
):
previous_file = None
for day in days:
filepath = os.path.join(
data_directory,
data_filename + "_{}_reordered.npz".format(day)
)
# print('Loading file: ', filepath)
with np.load(filepath) as data:
x_int = data["X_int"]
x_cat = data["X_cat"]
y = data["y"]
samples_in_file = y.shape[0]
batch_start_idx = 0
if split == "test" or split == "val":
length = int(np.ceil(samples_in_file / 2.))
if split == "test":
samples_in_file = length
elif split == "val":
batch_start_idx = samples_in_file - length
while batch_start_idx < samples_in_file - batch_size:
missing_samples = batch_size
if previous_file is not None:
missing_samples -= previous_file['y'].shape[0]
current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)
x_int_batch = x_int[current_slice]
x_cat_batch = x_cat[current_slice]
y_batch = y[current_slice]
if previous_file is not None:
x_int_batch = np.concatenate(
[previous_file['x_int'], x_int_batch],
axis=0
)
x_cat_batch = np.concatenate(
[previous_file['x_cat'], x_cat_batch],
axis=0
)
y_batch = np.concatenate([previous_file['y'], y_batch], axis=0)
previous_file = None
if x_int_batch.shape[0] != batch_size:
raise ValueError('should not happen')
yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)
batch_start_idx += missing_samples
if batch_start_idx != samples_in_file:
current_slice = slice(batch_start_idx, samples_in_file)
if previous_file is not None:
previous_file = {
'x_int' : np.concatenate(
[previous_file['x_int'], x_int[current_slice]],
axis=0
),
'x_cat' : np.concatenate(
[previous_file['x_cat'], x_cat[current_slice]],
axis=0
),
'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0)
}
else:
previous_file = {
'x_int' : x_int[current_slice],
'x_cat' : x_cat[current_slice],
'y' : y[current_slice]
}
if not drop_last:
yield _transform_features(
previous_file['x_int'],
previous_file['x_cat'],
previous_file['y'],
max_ind_range
)
def _test():
generator = _batch_generator(
data_filename='day',
data_directory='./input',
days=range(23),
split="train",
batch_size=2048,
drop_last=True,
max_ind_range=-1
)
t1 = time.time()
for x_int, lS_o, x_cat, y in generator:
t2 = time.time()
time_diff = t2 - t1
t1 = t2
print(
"time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape
)
)
class CriteoBinDataset(Dataset):
"""Binary version of criteo dataset."""
def __init__(self, data_file, counts_file,
batch_size=1, max_ind_range=-1, bytes_per_feature=4):
# dataset
self.tar_fea = 1 # single target
self.den_fea = 13 # 13 dense features
self.spa_fea = 26 # 26 sparse features
self.tad_fea = self.tar_fea + self.den_fea
self.tot_fea = self.tad_fea + self.spa_fea
self.batch_size = batch_size
self.max_ind_range = max_ind_range
self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
print('data file:', data_file, 'number of batches:', self.num_entries)
self.file = open(data_file, 'rb')
with np.load(counts_file) as data:
self.counts = data["counts"]
# hardcoded for now
self.m_den = 13
def __len__(self):
return self.num_entries
def __getitem__(self, idx):
self.file.seek(idx * self.bytes_per_entry, 0)
raw_data = self.file.read(self.bytes_per_entry)
array = np.frombuffer(raw_data, dtype=np.int32)
tensor = torch.from_numpy(array).view((-1, self.tot_fea))
return _transform_features(x_int_batch=tensor[:, 1:14],
x_cat_batch=tensor[:, 14:],
y_batch=tensor[:, 0],
max_ind_range=self.max_ind_range,
flag_input_torch_tensor=True)
def __del__(self):
self.file.close()
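# Illustrative usage (file names follow the _preprocess/_test_bin helpers below):
#   dataset = CriteoBinDataset(data_file='train_data.bin',
#                              counts_file='day_fea_count.npz', batch_size=2048)
#   x_int, lS_o, x_cat, y = dataset[0]   # one full batch per index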
def numpy_to_binary(input_files, output_file_path, split='train'):
"""Convert the data to a binary format to be read with CriteoBinDataset."""
# WARNING - both categorical and numerical data must fit into int32 for
# the following code to work correctly
with open(output_file_path, 'wb') as output_file:
if split == 'train':
for input_file in input_files:
print('Processing file: ', input_file)
np_data = np.load(input_file)
np_data = np.concatenate([np_data['y'].reshape(-1, 1),
np_data['X_int'],
np_data['X_cat']], axis=1)
np_data = np_data.astype(np.int32)
output_file.write(np_data.tobytes())
else:
assert len(input_files) == 1
np_data = np.load(input_files[0])
np_data = np.concatenate([np_data['y'].reshape(-1, 1),
np_data['X_int'],
np_data['X_cat']], axis=1)
np_data = np_data.astype(np.int32)
samples_in_file = np_data.shape[0]
midpoint = int(np.ceil(samples_in_file / 2.))
if split == "test":
begin = 0
end = midpoint
elif split == "val":
begin = midpoint
end = samples_in_file
else:
raise ValueError('Unknown split value: ', split)
output_file.write(np_data[begin:end].tobytes())
def _preprocess(args):
train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for
day in range(0, 23)]
test_valid_file = args.input_data_prefix + '_23_reordered.npz'
os.makedirs(args.output_directory, exist_ok=True)
for split in ['train', 'val', 'test']:
print('Running preprocessing for split =', split)
output_file = os.path.join(args.output_directory,
'{}_data.bin'.format(split))
input_files = train_files if split == 'train' else [test_valid_file]
numpy_to_binary(input_files=input_files,
output_file_path=output_file,
split=split)
def _test_bin():
parser = argparse.ArgumentParser()
parser.add_argument('--output_directory', required=True)
parser.add_argument('--input_data_prefix', required=True)
parser.add_argument('--split', choices=['train', 'test', 'val'],
required=True)
args = parser.parse_args()
_preprocess(args)
binary_data_file = os.path.join(args.output_directory,
'{}_data.bin'.format(args.split))
counts_file = os.path.join(args.output_directory, 'day_fea_count.npz')
dataset_binary = CriteoBinDataset(data_file=binary_data_file,
counts_file=counts_file,
batch_size=2048,)
from dlrm_data_pytorch import CriteoDataset
from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo
binary_loader = torch.utils.data.DataLoader(
dataset_binary,
batch_size=None,
shuffle=False,
num_workers=0,
collate_fn=None,
pin_memory=False,
drop_last=False,
)
original_dataset = CriteoDataset(
dataset='terabyte',
max_ind_range=10 * 1000 * 1000,
sub_sample_rate=1,
randomize=True,
split=args.split,
raw_path=args.input_data_prefix,
pro_data='dummy_string',
memory_map=True
)
original_loader = torch.utils.data.DataLoader(
original_dataset,
batch_size=2048,
shuffle=False,
num_workers=0,
collate_fn=collate_wrapper_criteo,
pin_memory=False,
drop_last=False,
)
assert len(dataset_binary) == len(original_loader)
for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader,
binary_loader)),
total=len(dataset_binary)):
for j in range(len(new_batch)):
if not np.array_equal(old_batch[j], new_batch[j]):
raise ValueError('FAILED: Datasets not equal')
if i > len(dataset_binary):
break
print('PASSED')
if __name__ == '__main__':
_test()
_test_bin()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import builtins
import os
import sys
import torch
import torch.distributed as dist
from torch.autograd import Function
from torch.autograd.profiler import record_function
from torch.nn.parallel import DistributedDataParallel as DDP
try:
import torch_ccl
except ImportError as e:
# print(e)
torch_ccl = False
try:
import torch_ucc
except ImportError as e:
torch_ucc = False
my_rank = -1
my_size = -1
my_local_rank = -1
my_local_size = -1
alltoall_supported = False
a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "")
myreq = None
def env2int(env_list, default=-1):
for e in env_list:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def get_my_slice(n):
k, m = divmod(n, my_size)
return slice(
my_rank * k + min(my_rank, m), (my_rank + 1) * k + min(my_rank + 1, m), 1
)
def get_split_lengths(n):
k, m = divmod(n, my_size)
if m == 0:
splits = None
my_len = k
else:
splits = [(k + 1) if i < m else k for i in range(my_size)]
my_len = splits[my_rank]
return (my_len, splits)
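# Illustrative example: with my_size = 4 and n = 10, divmod gives k = 2, m = 2, so
# get_split_lengths(10) returns (3, [3, 3, 2, 2]) on ranks 0-1 and (2, [3, 3, 2, 2])
# on ranks 2-3; when n divides evenly, splits is None and my_len = n // my_size.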
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
global myreq
global my_rank
global my_size
global my_local_rank
global my_local_size
global a2a_impl
global alltoall_supported
# guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
num_mpi_ranks = env2int(
["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
)
if backend == "" and num_mpi_ranks > 1:
if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
backend = "ccl"
elif use_gpu and dist.is_nccl_available():
backend = "nccl"
elif dist.is_mpi_available():
backend = "mpi"
else:
print(
"WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
)
backend = "gloo"
if backend != "":
# guess Rank and size
if rank == -1:
rank = env2int(
["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
)
if size == -1:
size = env2int(
[
"PMI_SIZE",
"OMPI_COMM_WORLD_SIZE",
"MV2_COMM_WORLD_SIZE",
"WORLD_SIZE",
],
1,
)
if not os.environ.get("RANK", None) and rank != -1:
os.environ["RANK"] = str(rank)
if not os.environ.get("WORLD_SIZE", None) and size != -1:
os.environ["WORLD_SIZE"] = str(size)
if not os.environ.get("MASTER_PORT", None):
os.environ["MASTER_PORT"] = "29500"
if not os.environ.get("MASTER_ADDR", None):
local_size = env2int(
[
"MPI_LOCALNRANKS",
"OMPI_COMM_WORLD_LOCAL_SIZE",
"MV2_COMM_WORLD_LOCAL_SIZE",
],
1,
)
if local_size != size and backend != "mpi":
print(
"Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
)
print(
"If this run hangs, try exporting rank 0's hostname as MASTER_ADDR"
)
os.environ["MASTER_ADDR"] = "127.0.0.1"
if size > 1:
if local_rank == -1:
my_local_rank = env2int(
[
"MPI_LOCALRANKID",
"OMPI_COMM_WORLD_LOCAL_RANK",
"MV2_COMM_WORLD_LOCAL_RANK",
"LOCAL_RANK",
],
0,
)
else:
my_local_rank = local_rank
my_local_size = env2int(
[
"MPI_LOCALNRANKS",
"OMPI_COMM_WORLD_LOCAL_SIZE",
"MV2_COMM_WORLD_LOCAL_SIZE",
],
1,
)
if use_gpu:
if my_local_size > torch.cuda.device_count():
print(
"Not sufficient GPUs available... local_size = %d, ngpus = %d"
% (my_local_size, torch.cuda.device_count())
)
sys.exit(1)
torch.cuda.set_device(my_local_rank)
dist.init_process_group(backend, rank=rank, world_size=size)
my_rank = dist.get_rank()
my_size = dist.get_world_size()
if my_rank == 0:
print("Running on %d ranks using %s backend" % (my_size, backend))
if hasattr(dist, "all_to_all_single"):
try:
t = torch.zeros([4])
if use_gpu:
t = t.cuda()
dist.all_to_all_single(t, t)
alltoall_supported = True
except RuntimeError as err:
print("fail to enable all_to_all_single primitive: %s" % err)
if a2a_impl == "alltoall" and alltoall_supported == False:
print(
"Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
% (a2a_impl, backend)
)
a2a_impl = "scatter"
if a2a_impl != "":
print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
else:
my_rank = 0
my_size = 1
my_local_rank = 0
my_local_size = 1
print_all(
"world size: %d, current rank: %d, local rank: %d"
% (my_size, my_rank, my_local_rank)
)
myreq = Request()
class Request(object):
def __init__(self):
self.req = None
self.tensor = None
self.WaitFunction = All2All_Scatter_Wait
def wait(self):
ret = self.WaitFunction.apply(*self.tensor)
self.req = None
self.tensor = None
return ret
class All2All_ScatterList_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
gather_list = []
req_list = []
for i in range(my_size):
for j in range(table_split_lengths[i]):
out_tensor = inputs[0].new_empty(
[a2a_info.local_batch_num, a2a_info.emb_dim]
)
scatter_list = (
list(inputs[j].split(batch_split_lengths, dim=0))
if i == my_rank
else []
)
req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True)
gather_list.append(out_tensor)
req_list.append(req)
myreq.req = req_list
myreq.tensor = tuple(gather_list)
myreq.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
for r in myreq.req:
r.wait()
myreq.req = None
grad_inputs = myreq.tensor
myreq.tensor = None
return (None, *grad_inputs)
class All2All_ScatterList_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
ctx.a2a_info = myreq.a2a_info
for r in myreq.req:
r.wait()
myreq.req = None
myreq.tensor = None
return output
@staticmethod
def backward(ctx, *grad_output):
global myreq
a2a_info = ctx.a2a_info
grad_output = [t.contiguous() for t in grad_output]
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else [a2a_info.local_batch_num] * my_size
)
per_rank_table_splits = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
grad_inputs = [
grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim])
for _ in range(a2a_info.local_table_num)
]
req_list = []
ind = 0
for i in range(my_size):
for j in range(per_rank_table_splits[i]):
gather_list = (
list(grad_inputs[j].split(batch_split_lengths, dim=0))
if i == my_rank
else None
)
req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True)
req_list.append(req)
ind += 1
myreq.req = req_list
myreq.tensor = grad_inputs
return tuple(grad_output)
class All2All_Scatter_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
input = torch.cat(inputs, dim=1)
scatter_list = list(input.split(batch_split_lengths, dim=0))
gather_list = []
req_list = []
for i in range(my_size):
out_tensor = input.new_empty(
[a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim]
)
req = dist.scatter(
out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True
)
gather_list.append(out_tensor)
req_list.append(req)
myreq.req = req_list
myreq.tensor = tuple(gather_list)
myreq.a2a_info = a2a_info
ctx.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
for r in myreq.req:
r.wait()
myreq.req = None
grad_input = myreq.tensor
grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1)
myreq.tensor = None
return (None, *grad_inputs)
class All2All_Scatter_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
ctx.a2a_info = myreq.a2a_info
for r in myreq.req:
r.wait()
myreq.req = None
myreq.tensor = None
return output
@staticmethod
def backward(ctx, *grad_output):
global myreq
assert len(grad_output) == my_size
scatter_list = [t.contiguous() for t in grad_output]
a2a_info = ctx.a2a_info
batch_split_lengths = (
a2a_info.global_batch_partition_slices
if a2a_info.global_batch_partition_slices
else a2a_info.local_batch_num
)
table_split_lengths = (
a2a_info.global_table_wise_parition_slices
if a2a_info.global_table_wise_parition_slices
else [a2a_info.local_table_num] * my_size
)
grad_input = grad_output[0].new_empty(
[a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num]
)
gather_list = list(grad_input.split(batch_split_lengths, dim=0))
req_list = []
for i in range(my_size):
req = dist.gather(
scatter_list[i],
gather_list if i == my_rank else [],
dst=i,
async_op=True,
)
req_list.append(req)
myreq.req = req_list
myreq.tensor = grad_input
return grad_output
class All2All_Req(Function):
@staticmethod
def forward(ctx, a2a_info, *inputs):
global myreq
with record_function("DLRM alltoall_req_fwd_single"):
batch_split_lengths = a2a_info.global_batch_partition_slices
if batch_split_lengths:
batch_split_lengths = [
m * a2a_info.emb_dim * a2a_info.local_table_num
for m in batch_split_lengths
]
table_split_lengths = a2a_info.global_table_wise_parition_slices
if table_split_lengths:
table_split_lengths = [
a2a_info.local_batch_num * e * a2a_info.emb_dim
for e in table_split_lengths
]
input = torch.cat(inputs, dim=1).view([-1])
output = input.new_empty(
[
a2a_info.global_table_num
* a2a_info.local_batch_num
* a2a_info.emb_dim
]
)
req = dist.all_to_all_single(
output, input, table_split_lengths, batch_split_lengths, async_op=True
)
myreq.req = req
myreq.tensor = []
myreq.tensor.append(output)
myreq.tensor = tuple(myreq.tensor)
a2a_info.batch_split_lengths = batch_split_lengths
a2a_info.table_split_lengths = table_split_lengths
myreq.a2a_info = a2a_info
ctx.a2a_info = a2a_info
return myreq.tensor
@staticmethod
def backward(ctx, *grad_output):
global myreq
with record_function("DLRM alltoall_req_bwd_single"):
a2a_info = ctx.a2a_info
myreq.req.wait()
myreq.req = None
grad_input = myreq.tensor
grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split(
a2a_info.emb_dim, dim=1
)
grad_inputs = [gin.contiguous() for gin in grad_inputs]
myreq.tensor = None
return (None, *grad_inputs)
class All2All_Wait(Function):
@staticmethod
def forward(ctx, *output):
global myreq
with record_function("DLRM alltoall_wait_fwd_single"):
a2a_info = myreq.a2a_info
ctx.a2a_info = a2a_info
myreq.req.wait()
myreq.req = None
myreq.tensor = None
table_split_lengths = (
a2a_info.table_split_lengths
if a2a_info.table_split_lengths
else a2a_info.local_table_num
* a2a_info.local_batch_num
* a2a_info.emb_dim
)
outputs = output[0].split(table_split_lengths)
outputs = tuple(
[out.view([a2a_info.local_batch_num, -1]) for out in outputs]
)
return outputs
@staticmethod
def backward(ctx, *grad_outputs):
global myreq
with record_function("DLRM alltoall_wait_bwd_single"):
a2a_info = ctx.a2a_info
grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
grad_output = torch.cat(grad_outputs)
grad_input = grad_output.new_empty(
[a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim]
)
req = dist.all_to_all_single(
grad_input,
grad_output,
a2a_info.batch_split_lengths,
a2a_info.table_split_lengths,
async_op=True,
)
myreq.req = req
myreq.tensor = grad_input
return (grad_output,)
class AllGather(Function):
@staticmethod
def forward(ctx, input, global_lengths, dim=0):
if not isinstance(global_lengths, (list, tuple)):
global_lengths = [global_lengths] * my_size
assert len(global_lengths) == my_size
assert global_lengths[my_rank] == input.size(dim)
local_start = sum(global_lengths[:my_rank])
output_size = list(input.size())
ctx.dim = dim
ctx.local_start = local_start
ctx.local_length = global_lengths[my_rank]
input = input.contiguous()
if dim == 0:
out_len = sum(global_lengths)
output_size[dim] = out_len
output = input.new_empty(output_size)
gather_list = list(output.split(global_lengths, dim=0))
else:
gather_list = []
for length in global_lengths:
output_size[dim] = length
gather_list.append(input.new_empty(output_size))
dist.all_gather(gather_list, input)
if dim != 0:
output = torch.cat(gather_list, dim=dim)
return output
@staticmethod
def backward(ctx, grad_output):
# print("Inside All2AllBackward")
dim = ctx.dim
start = ctx.local_start
length = ctx.local_length
grad_input = grad_output.narrow(dim, start, length)
return (grad_input, None, None)
class All2AllInfo(object):
pass
def alltoall(inputs, per_rank_table_splits):
global myreq
batch_size, emb_dim = inputs[0].size()
a2a_info = All2AllInfo()
a2a_info.local_table_num = len(inputs)
a2a_info.global_table_wise_parition_slices = per_rank_table_splits
(
a2a_info.local_batch_num,
a2a_info.global_batch_partition_slices,
) = get_split_lengths(batch_size)
a2a_info.emb_dim = emb_dim
a2a_info.batch_size = batch_size
a2a_info.global_table_num = (
sum(per_rank_table_splits)
if per_rank_table_splits
else a2a_info.local_table_num * my_size
)
if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall":
# print("Using All2All_Req")
output = All2All_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_Wait
elif a2a_impl == "" or a2a_impl == "scatter":
# print("Using All2All_Scatter_Req")
output = All2All_Scatter_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_Scatter_Wait
elif a2a_impl == "scatter_list":
# print("Using All2All_ScatterList_Req")
output = All2All_ScatterList_Req.apply(a2a_info, *inputs)
myreq.WaitFunction = All2All_ScatterList_Wait
else:
print(
"Unknown value set for DLRM_ALLTOALL_IMPL (%s), "
"please use one of [alltoall, scatter, scatter_list]" % a2a_impl
)
return myreq
def all_gather(input, lengths, dim=0):
if not lengths:
lengths = [input.size(0)] * my_size
return AllGather.apply(input, lengths, dim)
def barrier():
if my_size > 1:
dist.barrier()
# Override builtin print function to print only from rank 0
orig_print = builtins.print
def rank0_print(*args, **kwargs):
if my_rank <= 0 or kwargs.get("print_all", False):
orig_print(*args, **kwargs)
builtins.print = rank0_print
# Allow printing from all rank with explicit print_all
def print_all(*args, **kwargs):
orig_print(*args, **kwargs)