dcuai / dlexamples · Commits

Commit 56225fdf, authored Jul 14, 2022 by unknown
Add VAE-CF and dlrm
Parent: 5394b117
Changes: 57
Showing 20 changed files with 8769 additions and 0 deletions (+8769 -0)
PyTorch/Recommendation/VAE-CF/vae/utils/round.py            +22   -0
PyTorch/Recommendation/dlrm/.gitignore                       +138  -0
PyTorch/Recommendation/dlrm/CODE_OF_CONDUCT.md               +5    -0
PyTorch/Recommendation/dlrm/CONTRIBUTING.md                  +36   -0
PyTorch/Recommendation/dlrm/Dockerfile                       +15   -0
PyTorch/Recommendation/dlrm/LICENSE                          +21   -0
PyTorch/Recommendation/dlrm/README.md                        +244  -0
PyTorch/Recommendation/dlrm/bench/dlrm_s_benchmark.sh        +147  -0
PyTorch/Recommendation/dlrm/bench/dlrm_s_criteo_kaggle.sh    +32   -0
PyTorch/Recommendation/dlrm/bench/dlrm_s_criteo_terabyte.sh  +32   -0
PyTorch/Recommendation/dlrm/bench/run_and_time.sh            +19   -0
PyTorch/Recommendation/dlrm/cython/cython_compile.py         +26   -0
PyTorch/Recommendation/dlrm/cython/cython_criteo.py          +55   -0
PyTorch/Recommendation/dlrm/data_loader_terabyte.py          +368  -0
PyTorch/Recommendation/dlrm/data_utils.py                    +1292 -0
PyTorch/Recommendation/dlrm/dlrm_data_caffe2.py              +843  -0
PyTorch/Recommendation/dlrm/dlrm_data_pytorch.py             +1291 -0
PyTorch/Recommendation/dlrm/dlrm_s_caffe2.py                 +1703 -0
PyTorch/Recommendation/dlrm/dlrm_s_pytorch.py                +1877 -0
PyTorch/Recommendation/dlrm/extend_distributed.py            +603  -0
PyTorch/Recommendation/VAE-CF/vae/utils/round.py (new file, mode 100644)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from functools import partial


def round_n(x, n=8):
    return n * int(np.ceil(x / n))


round_8 = partial(round_n, n=8)
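A quick usage note (not part of the commit): round_n pads a value up to the next multiple of n, and round_8 is the n=8 case. The import path below is assumed from the package layout in the file list.

```
# Hypothetical usage; the module path is inferred from the file list above.
from vae.utils.round import round_8, round_n

print(round_8(13))         # 16 -> 13 padded up to the next multiple of 8
print(round_n(100, n=32))  # 128
```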
PyTorch/Recommendation/dlrm/.gitignore (new file, mode 100644)
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
PyTorch/Recommendation/dlrm/CODE_OF_CONDUCT.md (new file, mode 100644)
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
PyTorch/Recommendation/dlrm/CONTRIBUTING.md (new file, mode 100644)
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.

Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code

## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
PyTorch/Recommendation/dlrm/Dockerfile (new file, mode 100644)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}

ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install torch==1.3.1

WORKDIR /code
ADD . .
PyTorch/Recommendation/dlrm/LICENSE (new file, mode 100644)
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
PyTorch/Recommendation/dlrm/README.md (new file, mode 100644)
Deep Learning Recommendation Model for Personalization and Recommendation Systems:
=================================================================================
## Model Structure
```
output:
                    probability of a click
model:                        |
                             /\
                            /__\
                              |
      _____________________> Op  <___________________
    /                         |                      \
   /\                        /\                      /\
  /__\                      /__\           ...      /__\
   |                          |                       |
   |                         Op                      Op
   |                    ____/__\_____           ____/__\____
   |                   |_Emb_|____|__|    ...  |_Emb_|__|___|
input:
[ dense features ]     [sparse indices] , ..., [sparse indices]
```
More precise definition of model layers:

1) fully connected layers of an mlp

    z = f(y)
    y = Wx + b

2) embedding lookup (for a list of sparse indices p=[p1,...,pk])

    z = Op(e1,...,ek)
    obtain vectors e1=E[:,p1], ..., ek=E[:,pk]

3) Operator Op can be one of the following

    Sum(e1,...,ek) = e1 + ... + ek
    Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
    Cat(e1,...,ek) = [e1', ..., ek']'

    where ' denotes transpose operation
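For concreteness, here is a small sketch (not part of this repo) of the three interaction operators in plain PyTorch, written per-sample for k embedding vectors of equal dimension d:

```
import torch

def op_sum(embs):
    # Sum(e1,...,ek) = e1 + ... + ek
    return torch.stack(embs, dim=0).sum(dim=0)

def op_dot(embs):
    # Dot(e1,...,ek) = all pairwise dot products ei'ej, flattened
    E = torch.stack(embs, dim=0)   # shape (k, d)
    return (E @ E.t()).flatten()   # shape (k*k,)

def op_cat(embs):
    # Cat(e1,...,ek) = concatenation of the vectors
    return torch.cat(embs, dim=0)  # shape (k*d,)

embs = [torch.randn(4) for _ in range(3)]
print(op_sum(embs).shape, op_dot(embs).shape, op_cat(embs).shape)
# torch.Size([4]) torch.Size([9]) torch.Size([12])
```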
Running the Test Cases
--------------------
1) Quick model test
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
```
2) Debug mode (model parameters and sizes can be set manually)
```
$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
model arch:
mlp top arch 3 layers, with input to output dimensions:
[8 4 2 1]
# of interactions
8
mlp bot arch 2 layers, with input to output dimensions:
[4 3 2]
# of features (sparse and dense)
4
dense feature size
4
sparse feature size
2
# of embeddings (= # of sparse features) 3, with dimensions 2x:
[4 3 2]
data (inputs and targets):
mini-batch: 0
[[0.69647 0.28614 0.22685 0.55131]
 [0.71947 0.42311 0.98076 0.68483]]
[[[1], [0, 1]], [[0], [1]], [[1], [0]]]
[[0.55679]
 [0.15896]]
mini-batch: 1
[[0.36179 0.22826 0.29371 0.63098]
 [0.0921  0.4337  0.43086 0.49369]]
[[[1], [0, 2, 3]], [[1], [1, 2]], [[1], [1]]]
[[0.15307]
 [0.69553]]
mini-batch: 2
[[0.60306 0.54507 0.34276 0.30412]
 [0.41702 0.6813  0.87546 0.51042]]
[[[2], [0, 1, 2]], [[1], [2]], [[1], [1]]]
[[0.31877]
 [0.69197]]
initial parameters (weights and bias):
[[ 0.05438 -0.11105]
 [ 0.42513  0.34167]
 [-0.1426  -0.45641]
 [-0.19523 -0.10181]]
[[ 0.23667  0.57199]
 [-0.16638  0.30316]
 [ 0.10759  0.22136]]
[[-0.49338 -0.14301]
 [-0.36649 -0.22139]]
[[0.51313 0.66662 0.10591 0.13089]
 [0.32198 0.66156 0.84651 0.55326]
 [0.85445 0.38484 0.31679 0.35426]]
[0.17108 0.82911 0.33867]
[[0.55237 0.57855 0.52153]
 [0.00269 0.98835 0.90534]]
[0.20764 0.29249]
[[0.52001 0.90191 0.98363 0.25754 0.56436 0.80697 0.39437 0.73107]
 [0.16107 0.6007  0.86586 0.98352 0.07937 0.42835 0.20454 0.45064]
 [0.54776 0.09333 0.29686 0.92758 0.569   0.45741 0.75353 0.74186]
 [0.04858 0.7087  0.83924 0.16594 0.781   0.28654 0.30647 0.66526]]
[0.11139 0.66487 0.88786 0.69631]
[[0.44033 0.43821 0.7651  0.56564]
 [0.0849  0.58267 0.81484 0.33707]]
[0.92758 0.75072]
[[0.57406 0.75164]]
[0.07915]
DLRM_Net(
  (emb_l): ModuleList(
    (0): EmbeddingBag(4, 2, mode=sum)
    (1): EmbeddingBag(3, 2, mode=sum)
    (2): EmbeddingBag(2, 2, mode=sum)
  )
  (bot_l): Sequential(
    (0): Linear(in_features=4, out_features=3, bias=True)
    (1): ReLU()
    (2): Linear(in_features=3, out_features=2, bias=True)
    (3): ReLU()
  )
  (top_l): Sequential(
    (0): Linear(in_features=8, out_features=4, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4, out_features=2, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2, out_features=1, bias=True)
    (5): Sigmoid()
  )
)
time/loss/accuracy (if enabled):
Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000%
Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000%
Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000%
updated parameters (weights and bias):
[[ 0.0543  -0.1112 ]
 [ 0.42513  0.34167]
 [-0.14283 -0.45679]
 [-0.19532 -0.10197]]
[[ 0.23667  0.57199]
 [-0.1666   0.30285]
 [ 0.10751  0.22124]]
[[-0.49338 -0.14301]
 [-0.36664 -0.22164]]
[[0.51313 0.66663 0.10591 0.1309 ]
 [0.32196 0.66154 0.84649 0.55324]
 [0.85444 0.38482 0.31677 0.35425]]
[0.17109 0.82907 0.33863]
[[0.55238 0.57857 0.52154]
 [0.00265 0.98825 0.90528]]
[0.20764 0.29244]
[[0.51996 0.90184 0.98368 0.25752 0.56436 0.807   0.39437 0.73107]
 [0.16096 0.60055 0.86596 0.98348 0.07938 0.42842 0.20453 0.45064]
 [0.5476  0.0931  0.29701 0.92752 0.56902 0.45752 0.75351 0.74187]
 [0.04849 0.70857 0.83933 0.1659  0.78101 0.2866  0.30646 0.66526]]
[0.11137 0.66482 0.88778 0.69627]
[[0.44029 0.43816 0.76502 0.56561]
 [0.08485 0.5826  0.81474 0.33702]]
[0.92754 0.75067]
[[0.57379 0.7514 ]]
[0.07908]
```
Benchmarking
------------
1) Benchmark with randomly generated data
```
./bench/dlrm_s_benchmark.sh
```
2) Benchmark with the [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/).

- Download and extract the dataset into /data/kaggle
```
mkdir -p /data/kaggle
tar xvf kaggle-display-advertising-challenge-dataset.tar.gz
```
- Run the benchmark script
```
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
- The dataset paths can be changed via the following parameters in the script:
  - raw training data: --raw-data-file=<path/train.txt>
  - preprocessed data: --processed-data-file=<path/*.npz>
Reference training curves:

<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3) Multi-node testing: the code supports distributed training; the gloo/nccl/mpi backends are currently supported.
```
# Single-node test on 4 DCUs with the nccl backend, using randomly generated data:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000 --data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# For a multi-node run, add the following arguments:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
Saving and Loading Model Parameters
-------------------------------
*  --save-model=<path/model.pt> : path and file name for saving the trained model
*  --load-model=<path/model.pt> : load a previously saved model
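As a quick illustration (not from this repo), a checkpoint written via --save-model can be inspected with plain torch.load; the exact dictionary keys are defined by dlrm_s_pytorch.py, so they are only assumed here.

```
# Hypothetical inspection of a checkpoint produced by --save-model=<path/model.pt>.
import torch

ckpt = torch.load("model.pt", map_location="cpu")  # same path passed to --save-model
if isinstance(ckpt, dict):
    print(sorted(ckpt.keys()))  # keys are whatever dlrm_s_pytorch.py chose to store
```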
Other
----
For more usage examples and details, see https://github.com/facebookresearch/dlrm
Version
-------
0.1 : Initial release of the DLRM code
1.0 : DLRM with distributed training, cpu support for row-wise adagrad optimizer
Requirements
------------
pytorch (*11/10/20*)
scikit-learn
numpy
onnx (*optional*)
pydot (*optional*)
torchviz (*optional*)
mpi (*optional for distributed backend*)
PyTorch/Recommendation/dlrm/bench/dlrm_s_benchmark.sh (new file, mode 100644)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

cpu=1
gpu=1
pt=1
c2=1

ncores=28 #12 #6
nsockets="0"

ngpus="1 2 4 8"

numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"

data=random #synthetic
print_freq=100
rand_seed=727

c2_net="async_scheduling"

#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384

#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "

c2_args=" --caffe2-net-type="${c2_net}

# CPU Benchmarking
if [ $cpu = 1 ]; then
  echo "--------------------------------------------"
  echo "CPU Benchmarking - running on $ncores cores"
  echo "--------------------------------------------"
  if [ $pt = 1 ]; then
    outf="model1_CPU_PT_$ncores.log"
    outp="dlrm_s_pytorch.prof"
    echo "-------------------------------"
    echo "Running PT (log file: $outf)"
    echo "-------------------------------"
    cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
    echo $cmd
    eval $cmd
    min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
    echo "Min time per iteration = $min"
    # move profiling file(s)
    mv $outp ${outf//".log"/".prof"}
    mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
  fi
  if [ $c2 = 1 ]; then
    outf="model1_CPU_C2_$ncores.log"
    outp="dlrm_s_caffe2.prof"
    echo "-------------------------------"
    echo "Running C2 (log file: $outf)"
    echo "-------------------------------"
    cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp"
    echo $cmd
    eval $cmd
    min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
    echo "Min time per iteration = $min"
    # move profiling file (collected from stderr above)
    mv $outp ${outf//".log"/".prof"}
  fi
fi

# GPU Benchmarking
if [ $gpu = 1 ]; then
  echo "--------------------------------------------"
  echo "GPU Benchmarking - running on $ngpus GPUs"
  echo "--------------------------------------------"
  for _ng in $ngpus
  do
    # weak scaling
    # _mb_size=$((mb_size*_ng))
    # strong scaling
    _mb_size=$((mb_size*1))
    _gpus=$(seq -s, 0 $((_ng-1)))
    cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
    echo "-------------------"
    echo "Using GPUS: "$_gpus
    echo "-------------------"
    if [ $pt = 1 ]; then
      outf="model1_GPU_PT_$_ng.log"
      outp="dlrm_s_pytorch.prof"
      echo "-------------------------------"
      echo "Running PT (log file: $outf)"
      echo "-------------------------------"
      cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
      echo $cmd
      eval $cmd
      min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
      echo "Min time per iteration = $min"
      # move profiling file(s)
      mv $outp ${outf//".log"/".prof"}
      mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
    fi
    if [ $c2 = 1 ]; then
      outf="model1_GPU_C2_$_ng.log"
      outp="dlrm_s_caffe2.prof"
      echo "-------------------------------"
      echo "Running C2 (log file: $outf)"
      echo "-------------------------------"
      cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp"
      echo $cmd
      eval $cmd
      min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
      echo "Min time per iteration = $min"
      # move profiling file (collected from stderr above)
      mv $outp ${outf//".log"/".prof"}
    fi
  done
fi
PyTorch/Recommendation/dlrm/bench/dlrm_s_criteo_kaggle.sh (new file, mode 100644)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"
#dlrm_c2_bin="python dlrm_s_caffe2.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=/data/kaggle/train.txt --processed-data-file=/data/kaggle/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log

#echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
#$dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log

echo "done"
PyTorch/Recommendation/dlrm/bench/dlrm_s_criteo_terabyte.sh (new file, mode 100644)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"
dlrm_c2_bin="python dlrm_s_caffe2.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log

echo "run caffe2 ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
$dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log

echo "done"
PyTorch/Recommendation/dlrm/bench/run_and_time.sh (new file, mode 100644)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log

echo "done"
PyTorch/Recommendation/dlrm/cython/cython_compile.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals

from setuptools import setup
from Cython.Build import cythonize
from distutils.extension import Extension

ext_modules = [
    Extension(
        "data_utils_cython",
        ["data_utils_cython.pyx"],
        extra_compile_args=['-O3'],
        extra_link_args=['-O3'],
    )
]

setup(
    name='data_utils_cython',
    ext_modules=cythonize(ext_modules)
)
PyTorch/Recommendation/dlrm/cython/cython_criteo.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
#    > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile data_utils_cython.pyx to generate the .so
#    (it is important to keep the extension .pyx rather than .py
#    so that the C/C++ .so, not the .py, is loaded at import time)
#    > python cython_compile.py build_ext --inplace
#    This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals

import data_utils_cython as duc

if __name__ == "__main__":
    ### import packages ###
    import argparse

    ### parse arguments ###
    parser = argparse.ArgumentParser(description="Preprocess Criteo dataset")
    # model related parameters
    parser.add_argument("--max-ind-range", type=int, default=-1)
    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
    parser.add_argument("--data-randomize", type=str, default="total")  # or day or none
    parser.add_argument("--memory-map", action="store_true", default=False)
    parser.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
    parser.add_argument("--raw-data-file", type=str, default="")
    parser.add_argument("--processed-data-file", type=str, default="")
    args = parser.parse_args()

    duc.loadDataset(
        args.data_set,
        args.max_ind_range,
        args.data_sub_sample_rate,
        args.data_randomize,
        "train",
        args.raw_data_file,
        args.processed_data_file,
        args.memory_map
    )
PyTorch/Recommendation/dlrm/data_loader_terabyte.py (new file, mode 100644)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import numpy as np
from torch.utils.data import Dataset
import torch
import time
import math
from tqdm import tqdm
import argparse


class DataLoader:
    """
    DataLoader dedicated for the Criteo Terabyte Click Logs dataset
    """

    def __init__(self, data_filename, data_directory, days, batch_size,
                 max_ind_range=-1, split="train", drop_last_batch=False):
        self.data_filename = data_filename
        self.data_directory = data_directory
        self.days = days
        self.batch_size = batch_size
        self.max_ind_range = max_ind_range

        total_file = os.path.join(data_directory, data_filename + "_day_count.npz")
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"][np.array(days)]

        self.length = sum(total_per_file)
        if split == "test" or split == "val":
            self.length = int(np.ceil(self.length / 2.))
        self.split = split
        self.drop_last_batch = drop_last_batch

    def __iter__(self):
        return iter(
            _batch_generator(self.data_filename, self.data_directory, self.days,
                             self.batch_size, self.split, self.drop_last_batch,
                             self.max_ind_range)
        )

    def __len__(self):
        if self.drop_last_batch:
            return self.length // self.batch_size
        else:
            return math.ceil(self.length / self.batch_size)


def _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range,
                        flag_input_torch_tensor=False):
    if max_ind_range > 0:
        x_cat_batch = x_cat_batch % max_ind_range

    if flag_input_torch_tensor:
        x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
        x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
        y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
    else:
        x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
        x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
        y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)

    batch_size = x_cat_batch.shape[0]
    feature_count = x_cat_batch.shape[1]
    lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)

    return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)


def _batch_generator(data_filename, data_directory, days, batch_size, split,
                     drop_last, max_ind_range):
    previous_file = None
    for day in days:
        filepath = os.path.join(
            data_directory, data_filename + "_{}_reordered.npz".format(day)
        )

        # print('Loading file: ', filepath)
        with np.load(filepath) as data:
            x_int = data["X_int"]
            x_cat = data["X_cat"]
            y = data["y"]

        samples_in_file = y.shape[0]
        batch_start_idx = 0
        if split == "test" or split == "val":
            length = int(np.ceil(samples_in_file / 2.))
            if split == "test":
                samples_in_file = length
            elif split == "val":
                batch_start_idx = samples_in_file - length

        while batch_start_idx < samples_in_file - batch_size:
            missing_samples = batch_size
            if previous_file is not None:
                missing_samples -= previous_file['y'].shape[0]

            current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)

            x_int_batch = x_int[current_slice]
            x_cat_batch = x_cat[current_slice]
            y_batch = y[current_slice]

            if previous_file is not None:
                x_int_batch = np.concatenate([previous_file['x_int'], x_int_batch], axis=0)
                x_cat_batch = np.concatenate([previous_file['x_cat'], x_cat_batch], axis=0)
                y_batch = np.concatenate([previous_file['y'], y_batch], axis=0)
                previous_file = None

            if x_int_batch.shape[0] != batch_size:
                raise ValueError('should not happen')

            yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)

            batch_start_idx += missing_samples
        if batch_start_idx != samples_in_file:
            current_slice = slice(batch_start_idx, samples_in_file)
            if previous_file is not None:
                previous_file = {
                    'x_int': np.concatenate(
                        [previous_file['x_int'], x_int[current_slice]], axis=0),
                    'x_cat': np.concatenate(
                        [previous_file['x_cat'], x_cat[current_slice]], axis=0),
                    'y': np.concatenate([previous_file['y'], y[current_slice]], axis=0)
                }
            else:
                previous_file = {
                    'x_int': x_int[current_slice],
                    'x_cat': x_cat[current_slice],
                    'y': y[current_slice]
                }

    if not drop_last:
        yield _transform_features(
            previous_file['x_int'],
            previous_file['x_cat'],
            previous_file['y'],
            max_ind_range
        )


def _test():
    generator = _batch_generator(
        data_filename='day',
        data_directory='./input',
        days=range(23),
        split="train",
        batch_size=2048,
        drop_last=True,
        max_ind_range=-1
    )
    t1 = time.time()
    for x_int, lS_o, x_cat, y in generator:
        t2 = time.time()
        time_diff = t2 - t1
        t1 = t2
        print(
            "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
                time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape
            )
        )


class CriteoBinDataset(Dataset):
    """Binary version of criteo dataset."""

    def __init__(self, data_file, counts_file,
                 batch_size=1, max_ind_range=-1, bytes_per_feature=4):
        # dataset
        self.tar_fea = 1   # single target
        self.den_fea = 13  # 13 dense  features
        self.spa_fea = 26  # 26 sparse features
        self.tad_fea = self.tar_fea + self.den_fea
        self.tot_fea = self.tad_fea + self.spa_fea

        self.batch_size = batch_size
        self.max_ind_range = max_ind_range
        self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)

        self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)

        print('data file:', data_file, 'number of batches:', self.num_entries)
        self.file = open(data_file, 'rb')

        with np.load(counts_file) as data:
            self.counts = data["counts"]

        # hardcoded for now
        self.m_den = 13

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        self.file.seek(idx * self.bytes_per_entry, 0)
        raw_data = self.file.read(self.bytes_per_entry)
        array = np.frombuffer(raw_data, dtype=np.int32)
        tensor = torch.from_numpy(array).view((-1, self.tot_fea))

        return _transform_features(x_int_batch=tensor[:, 1:14],
                                   x_cat_batch=tensor[:, 14:],
                                   y_batch=tensor[:, 0],
                                   max_ind_range=self.max_ind_range,
                                   flag_input_torch_tensor=True)

    def __del__(self):
        self.file.close()


def numpy_to_binary(input_files, output_file_path, split='train'):
    """Convert the data to a binary format to be read with CriteoBinDataset."""

    # WARNING - both categorical and numerical data must fit into int32 for
    # the following code to work correctly

    with open(output_file_path, 'wb') as output_file:
        if split == 'train':
            for input_file in input_files:
                print('Processing file: ', input_file)

                np_data = np.load(input_file)
                np_data = np.concatenate([np_data['y'].reshape(-1, 1),
                                          np_data['X_int'],
                                          np_data['X_cat']], axis=1)
                np_data = np_data.astype(np.int32)

                output_file.write(np_data.tobytes())
        else:
            assert len(input_files) == 1
            np_data = np.load(input_files[0])
            np_data = np.concatenate([np_data['y'].reshape(-1, 1),
                                      np_data['X_int'],
                                      np_data['X_cat']], axis=1)
            np_data = np_data.astype(np.int32)

            samples_in_file = np_data.shape[0]
            midpoint = int(np.ceil(samples_in_file / 2.))
            if split == "test":
                begin = 0
                end = midpoint
            elif split == "val":
                begin = midpoint
                end = samples_in_file
            else:
                raise ValueError('Unknown split value: ', split)

            output_file.write(np_data[begin:end].tobytes())


def _preprocess(args):
    train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day)
                   for day in range(0, 23)]

    test_valid_file = args.input_data_prefix + '_23_reordered.npz'

    os.makedirs(args.output_directory, exist_ok=True)
    for split in ['train', 'val', 'test']:
        print('Running preprocessing for split =', split)

        output_file = os.path.join(args.output_directory, '{}_data.bin'.format(split))

        input_files = train_files if split == 'train' else [test_valid_file]
        numpy_to_binary(input_files=input_files,
                        output_file_path=output_file,
                        split=split)


def _test_bin():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_directory', required=True)
    parser.add_argument('--input_data_prefix', required=True)
    parser.add_argument('--split', choices=['train', 'test', 'val'], required=True)
    args = parser.parse_args()

    _preprocess(args)

    binary_data_file = os.path.join(args.output_directory,
                                    '{}_data.bin'.format(args.split))

    counts_file = os.path.join(args.output_directory, 'day_fea_count.npz')
    dataset_binary = CriteoBinDataset(data_file=binary_data_file,
                                      counts_file=counts_file,
                                      batch_size=2048,)
    from dlrm_data_pytorch import CriteoDataset
    from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo

    binary_loader = torch.utils.data.DataLoader(
        dataset_binary,
        batch_size=None,
        shuffle=False,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
    )

    original_dataset = CriteoDataset(
        dataset='terabyte',
        max_ind_range=10 * 1000 * 1000,
        sub_sample_rate=1,
        randomize=True,
        split=args.split,
        raw_path=args.input_data_prefix,
        pro_data='dummy_string',
        memory_map=True
    )

    original_loader = torch.utils.data.DataLoader(
        original_dataset,
        batch_size=2048,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_wrapper_criteo,
        pin_memory=False,
        drop_last=False,
    )

    assert len(dataset_binary) == len(original_loader)
    for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, binary_loader)),
                                          total=len(dataset_binary)):
        for j in range(len(new_batch)):
            if not np.array_equal(old_batch[j], new_batch[j]):
                raise ValueError('FAILED: Datasets not equal')
        if i > len(dataset_binary):
            break

    print('PASSED')


if __name__ == '__main__':
    _test()
    _test_bin()
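A worked example (not part of the file) of the record layout that CriteoBinDataset assumes: each sample is 1 label + 13 dense + 26 sparse int32 values, so __getitem__ reads one fixed-size slice of the binary file per batch.

```
# Arithmetic behind CriteoBinDataset.bytes_per_entry for the default settings.
tot_fea = 1 + 13 + 26        # label + dense + sparse features per sample
bytes_per_feature = 4        # everything stored as int32
batch_size = 2048
bytes_per_entry = bytes_per_feature * tot_fea * batch_size
print(bytes_per_entry)       # 327680 -> __getitem__(idx) seeks to idx * 327680
```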
PyTorch/Recommendation/dlrm/data_utils.py (new file, mode 100644; diff collapsed, not shown)
PyTorch/Recommendation/dlrm/dlrm_data_caffe2.py (new file, mode 100644; diff collapsed, not shown)
PyTorch/Recommendation/dlrm/dlrm_data_pytorch.py (new file, mode 100644; diff collapsed, not shown)
PyTorch/Recommendation/dlrm/dlrm_s_caffe2.py (new file, mode 100644; diff collapsed, not shown)
PyTorch/Recommendation/dlrm/dlrm_s_pytorch.py (new file, mode 100644; diff collapsed, not shown)
PyTorch/Recommendation/dlrm/extend_distributed.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import builtins
import os
import sys

import torch
import torch.distributed as dist
from torch.autograd import Function
from torch.autograd.profiler import record_function
from torch.nn.parallel import DistributedDataParallel as DDP

try:
    import torch_ccl
except ImportError as e:
    # print(e)
    torch_ccl = False

try:
    import torch_ucc
except ImportError as e:
    torch_ucc = False

my_rank = -1
my_size = -1
my_local_rank = -1
my_local_size = -1
alltoall_supported = False
a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "")

myreq = None


def env2int(env_list, default=-1):
    for e in env_list:
        val = int(os.environ.get(e, -1))
        if val >= 0:
            return val
    return default


def get_my_slice(n):
    k, m = divmod(n, my_size)
    return slice(
        my_rank * k + min(my_rank, m), (my_rank + 1) * k + min(my_rank + 1, m), 1
    )


def get_split_lengths(n):
    k, m = divmod(n, my_size)
    if m == 0:
        splits = None
        my_len = k
    else:
        splits = [(k + 1) if i < m else k for i in range(my_size)]
        my_len = splits[my_rank]
    return (my_len, splits)


def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
    global myreq
    global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported

    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(
        ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
    )
    if backend == "" and num_mpi_ranks > 1:
        if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
            backend = "ccl"
        elif use_gpu and dist.is_nccl_available():
            backend = "nccl"
        elif dist.is_mpi_available():
            backend = "mpi"
        else:
            print(
                "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
            )
            backend = "gloo"

    if backend != "":
        # guess Rank and size
        if rank == -1:
            rank = env2int(
                ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
            )
        if size == -1:
            size = env2int(
                ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"],
                1,
            )
        if not os.environ.get("RANK", None) and rank != -1:
            os.environ["RANK"] = str(rank)
        if not os.environ.get("WORLD_SIZE", None) and size != -1:
            os.environ["WORLD_SIZE"] = str(size)
        if not os.environ.get("MASTER_PORT", None):
            os.environ["MASTER_PORT"] = "29500"
        if not os.environ.get("MASTER_ADDR", None):
            local_size = env2int(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
                1,
            )
            if local_size != size and backend != "mpi":
                print(
                    "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
                )
                print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
            os.environ["MASTER_ADDR"] = "127.0.0.1"

    if size > 1:
        if local_rank == -1:
            my_local_rank = env2int(
                ["MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK", "LOCAL_RANK"],
                0,
            )
        else:
            my_local_rank = local_rank
        my_local_size = env2int(
            ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
            1,
        )
        if use_gpu:
            if my_local_size > torch.cuda.device_count():
                print(
                    "Not sufficient GPUs available... local_size = %d, ngpus = %d"
                    % (my_local_size, torch.cuda.device_count())
                )
                sys.exit(1)
            torch.cuda.set_device(my_local_rank)
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
        if hasattr(dist, "all_to_all_single"):
            try:
                t = torch.zeros([4])
                if use_gpu:
                    t = t.cuda()
                dist.all_to_all_single(t, t)
                alltoall_supported = True
            except RuntimeError as err:
                print("fail to enable all_to_all_single primitive: %s" % err)
        if a2a_impl == "alltoall" and alltoall_supported == False:
            print(
                "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
                % (a2a_impl, backend)
            )
            a2a_impl = "scatter"
        if a2a_impl != "":
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
    else:
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    print_all(
        "world size: %d, current rank: %d, local rank: %d"
        % (my_size, my_rank, my_local_rank)
    )
    myreq = Request()


class Request(object):
    def __init__(self):
        self.req = None
        self.tensor = None
        self.WaitFunction = All2All_Scatter_Wait

    def wait(self):
        ret = self.WaitFunction.apply(*self.tensor)
        self.req = None
        self.tensor = None
        return ret


class All2All_ScatterList_Req(Function):
    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        gather_list = []
        req_list = []
        for i in range(my_size):
            for j in range(table_split_lengths[i]):
                out_tensor = inputs[0].new_empty(
                    [a2a_info.local_batch_num, a2a_info.emb_dim]
                )
                scatter_list = (
                    list(inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else []
                )
                req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True)
                gather_list.append(out_tensor)
                req_list.append(req)
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_inputs = myreq.tensor
        myreq.tensor = None
        return (None, *grad_inputs)


class All2All_ScatterList_Wait(Function):
    @staticmethod
    def forward(ctx, *output):
        global myreq
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        a2a_info = ctx.a2a_info
        grad_output = [t.contiguous() for t in grad_output]
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else [a2a_info.local_batch_num] * my_size
        )
        per_rank_table_splits = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        grad_inputs = [
            grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim])
            for _ in range(a2a_info.local_table_num)
        ]
        req_list = []
        ind = 0
        for i in range(my_size):
            for j in range(per_rank_table_splits[i]):
                gather_list = (
                    list(grad_inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else None
                )
                req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True)
                req_list.append(req)
                ind += 1
        myreq.req = req_list
        myreq.tensor = grad_inputs
        return tuple(grad_output)


class All2All_Scatter_Req(Function):
    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        input = torch.cat(inputs, dim=1)
        scatter_list = list(input.split(batch_split_lengths, dim=0))
        gather_list = []
        req_list = []
        for i in range(my_size):
            out_tensor = input.new_empty(
                [a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim]
            )
            req = dist.scatter(
                out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True
            )
            gather_list.append(out_tensor)
            req_list.append(req)
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        ctx.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_input = myreq.tensor
        grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1)
        myreq.tensor = None
        return (None, *grad_inputs)


class All2All_Scatter_Wait(Function):
    @staticmethod
    def forward(ctx, *output):
        global myreq
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        assert len(grad_output) == my_size
        scatter_list = [t.contiguous() for t in grad_output]
        a2a_info = ctx.a2a_info
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        grad_input = grad_output[0].new_empty(
            [a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num]
        )
        gather_list = list(grad_input.split(batch_split_lengths, dim=0))
        req_list = []
        for i in range(my_size):
            req = dist.gather(
                scatter_list[i],
                gather_list if i == my_rank else [],
                dst=i,
                async_op=True,
            )
            req_list.append(req)
        myreq.req = req_list
        myreq.tensor = grad_input
        return grad_output


class All2All_Req(Function):
    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        with record_function("DLRM alltoall_req_fwd_single"):
            batch_split_lengths = a2a_info.global_batch_partition_slices
            if batch_split_lengths:
                batch_split_lengths = [
                    m * a2a_info.emb_dim * a2a_info.local_table_num
                    for m in batch_split_lengths
                ]
            table_split_lengths = a2a_info.global_table_wise_parition_slices
            if table_split_lengths:
                table_split_lengths = [
                    a2a_info.local_batch_num * e * a2a_info.emb_dim
                    for e in table_split_lengths
                ]
            input = torch.cat(inputs, dim=1).view([-1])
            output = input.new_empty(
                [a2a_info.global_table_num * a2a_info.local_batch_num * a2a_info.emb_dim]
            )
            req = dist.all_to_all_single(
                output, input, table_split_lengths, batch_split_lengths, async_op=True
            )

            myreq.req = req
            myreq.tensor = []
            myreq.tensor.append(output)
            myreq.tensor = tuple(myreq.tensor)
            a2a_info.batch_split_lengths = batch_split_lengths
            a2a_info.table_split_lengths = table_split_lengths
            myreq.a2a_info = a2a_info
            ctx.a2a_info = a2a_info
            return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        with record_function("DLRM alltoall_req_bwd_single"):
            a2a_info = ctx.a2a_info
            myreq.req.wait()
            myreq.req = None
            grad_input = myreq.tensor
            grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split(
                a2a_info.emb_dim, dim=1
            )
            grad_inputs = [gin.contiguous() for gin in grad_inputs]
            myreq.tensor = None
            return (None, *grad_inputs)


class All2All_Wait(Function):
    @staticmethod
    def forward(ctx, *output):
        global myreq
        with record_function("DLRM alltoall_wait_fwd_single"):
            a2a_info = myreq.a2a_info
            ctx.a2a_info = a2a_info
            myreq.req.wait()
            myreq.req = None
            myreq.tensor = None
            table_split_lengths = (
                a2a_info.table_split_lengths
                if a2a_info.table_split_lengths
                else a2a_info.local_table_num * a2a_info.local_batch_num * a2a_info.emb_dim
            )
            outputs = output[0].split(table_split_lengths)
            outputs = tuple(
                [out.view([a2a_info.local_batch_num, -1]) for out in outputs]
            )
            return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        global myreq
        with record_function("DLRM alltoall_wait_bwd_single"):
            a2a_info = ctx.a2a_info
            grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
            grad_output = torch.cat(grad_outputs)
            grad_input = grad_output.new_empty(
                [a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim]
            )
            req = dist.all_to_all_single(
                grad_input,
                grad_output,
                a2a_info.batch_split_lengths,
                a2a_info.table_split_lengths,
                async_op=True,
            )

            myreq.req = req
            myreq.tensor = grad_input
            return (grad_output,)


class AllGather(Function):
    @staticmethod
    def forward(ctx, input, global_lengths, dim=0):
        if not isinstance(global_lengths, (list, tuple)):
            global_lengths = [global_lengths] * my_size

        assert len(global_lengths) == my_size
        assert global_lengths[my_rank] == input.size(dim)
        local_start = sum(global_lengths[:my_rank])

        output_size = list(input.size())

        ctx.dim = dim
        ctx.local_start = local_start
        ctx.local_length = global_lengths[my_rank]

        input = input.contiguous()
        if dim == 0:
            out_len = sum(global_lengths)
            output_size[dim] = out_len
            output = input.new_empty(output_size)
            gather_list = list(output.split(global_lengths, dim=0))
        else:
            gather_list = [torch.empty_like(input) for _ in range(my_size)]
            gather_list = []
            for length in global_lengths:
                output_size[dim] = length
                gather_list.append(input.new_empty(output_size))

        dist.all_gather(gather_list, input)

        if dim != 0:
            output = torch.cat(gather_list, dim=dim)

        return output

    @staticmethod
    def backward(ctx, grad_output):
        # print("Inside All2AllBackward")
        dim = ctx.dim
        start = ctx.local_start
        length = ctx.local_length

        grad_input = grad_output.narrow(dim, start, length)

        return (grad_input, None, None)


class All2AllInfo(object):
    pass


def alltoall(inputs, per_rank_table_splits):
    global myreq
    batch_size, emb_dim = inputs[0].size()
    a2a_info = All2AllInfo()
    a2a_info.local_table_num = len(inputs)
    a2a_info.global_table_wise_parition_slices = per_rank_table_splits
    (
        a2a_info.local_batch_num,
        a2a_info.global_batch_partition_slices,
    ) = get_split_lengths(batch_size)
    a2a_info.emb_dim = emb_dim
    a2a_info.batch_size = batch_size
    a2a_info.global_table_num = (
        sum(per_rank_table_splits)
        if per_rank_table_splits
        else a2a_info.local_table_num * my_size
    )

    if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall":
        # print("Using All2All_Req")
        output = All2All_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Wait
    elif a2a_impl == "" or a2a_impl == "scatter":
        # print("Using All2All_Scatter_Req")
        output = All2All_Scatter_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Scatter_Wait
    elif a2a_impl == "scatter_list":
        # print("Using All2All_ScatterList_Req")
        output = All2All_ScatterList_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_ScatterList_Wait
    else:
        print(
            "Unknown value set for DLRM_ALLTOALL_IMPL (%s), "
            "please use one of [alltoall, scatter, scatter_list]" % a2a_impl
        )
    return myreq


def all_gather(input, lengths, dim=0):
    if not lengths:
        lengths = [input.size(0)] * my_size
    return AllGather.apply(input, lengths, dim)


def barrier():
    if my_size > 1:
        dist.barrier()


# Override builtin print function to print only from rank 0
orig_print = builtins.print


def rank0_print(*args, **kwargs):
    if my_rank <= 0 or kwargs.get("print_all", False):
        orig_print(*args, **kwargs)


builtins.print = rank0_print


# Allow printing from all rank with explicit print_all
def print_all(*args, **kwargs):
    orig_print(*args, **kwargs)
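A worked example (not part of the file) that mirrors the logic of get_split_lengths above, with the world size and rank passed in explicitly instead of being read from the module-level globals set by init_distributed:

```
# Splitting n items (e.g. embedding tables) as evenly as possible across ranks.
def split_lengths_demo(n, world_size, rank):
    k, m = divmod(n, world_size)
    splits = None if m == 0 else [(k + 1) if i < m else k for i in range(world_size)]
    my_len = k if splits is None else splits[rank]
    return my_len, splits

print(split_lengths_demo(10, 4, 0))  # (3, [3, 3, 2, 2]) -> rank 0 owns 3 of 10 tables
print(split_lengths_demo(8, 4, 0))   # (2, None)         -> even split, no per-rank list
```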