"mmdet3d/datasets/transforms/__init__.py" did not exist on "bdb3c14d6ce427d8de147c20cb6521e3cf3b8ed6"
Commit 9c8a2a14 authored by xinghao's avatar xinghao
Browse files

Initial commit

parents
Pipeline #3002 canceled with stages
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.
# Contributing to DLRM
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* in general, please maintain a consistent style with the rest of the code
## License
By contributing to DLRM, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Base image is overridable at build time:
#   docker build --build-arg FROM_IMAGE_NAME=<image> .
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
FROM ${FROM_IMAGE_NAME}

# Install Python dependencies first so this layer caches independently of
# source-code changes.
ADD requirements.txt .
RUN pip install -r requirements.txt
# NOTE(review): the base image already ships torch 1.3; this pins the exact
# 1.3.1 build — confirm the explicit reinstall is still required.
RUN pip install torch==1.3.1

# Copy the DLRM source tree into the image working directory.
WORKDIR /code
ADD . .
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
DLRM:Deep Learning Recommendation Model for Personalization and Recommendation Systems
=================================================================================
简介
------------
一个深度学习推荐模型(DLRM)的实现。
模型的输入由稠密特征和稀疏特征组成。前者是一个浮点值向量;后者是一组稀疏索引,用于查找嵌入表中的向量(这些嵌入表由浮点向量组成)。
选取到的这些向量会被送入若干 多层感知机(MLP)网络(通常在示意图中用三角形表示),在某些情况下,这些向量之间还会通过特定的算子(Ops)进行交互
```
output:
probability of a click
model: |
/\
/__\
|
_____________________> Op <___________________
/ | \
/\ /\ /\
/__\ /__\ ... /__\
| | |
| Op Op
| ____/__\_____ ____/__\____
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```
对模型各层的更精确定义:
1)MLP(多层感知机)的全连接层
z = f(y)
y = Wx + b
2)嵌入查找(针对一组稀疏索引 p=[p1,...,pk]p = [p_1, ..., p_k]p=[p1,...,pk])
z = Op(e1,...,ek)
obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
3)算子 Op 可以是以下几种之一
Sum(e1,...,ek) = e1 + ... + ek
Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
Cat(e1,...,ek) = [e1', ..., ek']'
where ' denotes transpose operation
部署
--------------
### Docker
**容器创建**
```bash
docker run --shm-size 500g --network=host --name=dlrm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/workspace/:/path/to/workspace/ -v /opt/hyhal:/opt/hyhal:ro -it image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10 bash
```
**依赖安装**
```bash
cd dlrm
pip install -r requirements.txt
pip install tensorboard
```
注意:使用 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 会导致 `torchrec-nightly` 相关依赖安装失败
Demo
--------------------
1)使用微型模型运行代码
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6
```
<img src="./images/image1.png" width="900">
2)在调试模式下使用微型模型运行代码
```bash
python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode
```
<img src="./images/image2.png" width="900">
测试
-------
验证代码功能正确性
```bash
./test/dlrm_s_test.sh
```
<img src="./images/image3.png" width="900">
基准测试
------------
1)性能基准测试
```bash
./bench/dlrm_s_benchmark.sh
```
2)代码支持数据集 [Criteo Kaggle Display Advertising Challenge Dataset](https://ailab.criteo.com/ressources/)
- 请按以下步骤准备数据,以便在 DLRM 代码中使用:
- 首先,指定下载好的原始数据文件(train.txt),使用参数 `--raw-data-file=<path/train.txt>`
- 然后对数据进行预处理(分类、跨天合并等),以便在 DLRM 代码中使用
- 预处理后的数据会存储为 `*.npz` 文件,路径为 `<root_dir>/input/*.npz`
- 预处理后的文件 (`*.npz`) 可以在后续运行中直接使用,参数为 `--processed-data-file=<path/*.npz>`
- 可以使用以下脚本对模型进行训练
```bash
./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024]
```
若要启用gpu,添加参数 `--use-gpu`,若要启用纯推理模型,添加参数 `--inference-only` 并使用参数 `--load-model`指定权重文件
<img src="./kaggle_dac_loss_accuracy_plots.png" width="900" height="320">
3)代码支持数据集 [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/).
- 请按以下步骤准备数据,以便在 DLRM 代码中使用:
- 首先,下载原始数据文件 `day_0.gz` 到 `day_23.gz` 并解压
- 使用参数 `--raw-data-file=<path/day>` 指定解压后的文本文件 `day_0` 到 `day_23` 的位置(天数后缀会自动追加)
- 然后对数据进行预处理(分类、跨天合并等),以便在 DLRM 代码中使用
- 预处理后的数据会存储为 `*.npz` 文件,路径为 `<root_dir>/input/*.npz`
- 预处理后的文件 (`*.npz`) 可以在后续运行中直接使用,参数为 `--processed-data-file=<path/*.npz>`
- 可以使用以下脚本对模型进行训练
```bash
./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"]
```
​ 若要启用gpu,添加参数 `--use-gpu`,若要启用纯推理模型,添加参数 `--inference-only` 并使用参数 `--load-model`指定权重文件
- 对应的预训练模型可从以下链接下载:[dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt)
<img src="./terabyte_0875_loss_accuracy_plots.png" width="900" height="320">
4)代码支持 [MLPerf benchmark](https://mlperf.org).
- 请参考以下训练参数
```bash
--mlperf-logging 用于跟踪多个指标,包括曲线下面积(AUC)
--mlperf-acc-threshold 允许基于准确率指标提前停止训练
--mlperf-auc-threshold 允许基于 AUC 指标提前停止训练
--mlperf-bin-loader 启用将数据预处理成单个二进制文件
--mlperf-bin-shuffle 控制是否对小批量数据进行随机打乱
```
- MLPerf 模型可使用以下脚本进行训练。
```bash
./bench/run_and_time.sh [--use-gpu]
```
- 对应的预训练模型可从以下链接下载:[dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt)
5)该代码现在支持同步分布式训练,支持 gloo/nccl/mpi 后端,同时提供了 [PyTorch 分布式启动器](https://pytorch.org/docs/stable/distributed.html#launch-utility) 和 Mpirun 的启动方式。对于 MPI,用户需要自行编写 MPI 启动脚本来配置运行主机。例如,使用 PyTorch 分布式启动器,可以使用如下命令作为启动脚本:
```bash
# 在单节点 8 GPU 环境下,使用 NCCL 作为后端处理随机生成的数据集时:
python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000
--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl
# 对于多节点环境,用户可以根据启动器手册添加相关参数,例如:
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234
```
模型检查点保存/加载
-------------------------------
在训练过程中,可以使用参数 `--save-model=<path/model.pt>` 保存模型
当测试准确率有所提升时(按 `--test-freq` 指定的间隔检查),模型会被保存
已保存的模型可以通过 `--load-model=<path/model.pt>` 加载
加载后,模型可以用于继续训练(已保存的模型相当于一个检查点);或者,也可以通过指定 `--inference-only` 选项,仅使用保存的模型在测试数据集上进行评估
参考资料
-------
https://github.com/facebookresearch/dlrm
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Benchmark driver: runs the DLRM PyTorch implementation on random data,
# once pinned to CPU cores and once per GPU count, and reports the minimum
# per-iteration time extracted from each run's log.

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

# Which benchmark flavors to run (1 = enabled).
cpu=1
gpu=1
pt=1

# CPU run shape: pin to ncores cores on socket nsockets (no hyper-threading).
ncores=28 #12 #6
nsockets="0"

# GPU sweep: run once for each GPU count listed here.
ngpus="1 2 4 8"

numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT
dlrm_pt_bin="python dlrm_s_pytorch.py"

data=random #synthetic
print_freq=100
rand_seed=727

#Model param
mb_size=2048 #1024 #512 #256
nbatches=1000 #500 #100
bot_mlp="512-512-64"
top_mlp="1024-1024-1024-1"
emb_size=64
nindices=100
emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000"
interaction="dot"
tnworkers=0
tmb_size=16384

# Common argument string shared by the CPU and GPU invocations below.
#_args="--mini-batch-size="${mb_size}\
_args=" --num-batches="${nbatches}\
" --data-generation="${data}\
" --arch-mlp-bot="${bot_mlp}\
" --arch-mlp-top="${top_mlp}\
" --arch-sparse-feature-size="${emb_size}\
" --arch-embedding-size="${emb}\
" --num-indices-per-lookup="${nindices}\
" --arch-interaction-op="${interaction}\
" --numpy-rand-seed="${rand_seed}\
" --print-freq="${print_freq}\
" --print-time"\
" --enable-profiling "

# CPU Benchmarking
if [ $cpu = 1 ]; then
    echo "--------------------------------------------"
    echo "CPU Benchmarking - running on $ncores cores"
    echo "--------------------------------------------"
    if [ $pt = 1 ]; then
        outf="model1_CPU_PT_$ncores.log"
        outp="dlrm_s_pytorch.prof"
        echo "-------------------------------"
        echo "Running PT (log file: $outf)"
        echo "-------------------------------"
        cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf"
        echo $cmd
        eval $cmd
        # Field 7 of the "iteration" log lines is taken as the per-iteration
        # time; keep only the smallest value across the run.
        # NOTE(review): confirm the field index against dlrm_s_pytorch.py's
        # print format.
        min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
        echo "Min time per iteration = $min"
        # move profiling file(s)
        mv $outp ${outf//".log"/".prof"}
        mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
    fi
fi

# GPU Benchmarking
if [ $gpu = 1 ]; then
    echo "--------------------------------------------"
    echo "GPU Benchmarking - running on $ngpus GPUs"
    echo "--------------------------------------------"
    for _ng in $ngpus
    do
        # weak scaling
        # _mb_size=$((mb_size*_ng))
        # strong scaling
        _mb_size=$((mb_size*1))
        # Expose GPUs 0..(_ng-1) to the run via CUDA_VISIBLE_DEVICES.
        _gpus=$(seq -s, 0 $((_ng-1)))
        cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus"
        echo "-------------------"
        echo "Using GPUS: "$_gpus
        echo "-------------------"
        if [ $pt = 1 ]; then
            outf="model1_GPU_PT_$_ng.log"
            outp="dlrm_s_pytorch.prof"
            echo "-------------------------------"
            echo "Running PT (log file: $outf)"
            echo "-------------------------------"
            cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf"
            echo $cmd
            eval $cmd
            # Same minimum-iteration-time extraction as the CPU run above.
            min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}')
            echo "Min time per iteration = $min"
            # move profiling file(s)
            mv $outp ${outf//".log"/".prof"}
            mv ${outp//".prof"/".json"} ${outf//".log"/".json"}
        fi
    done
fi
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Train DLRM on the Criteo Kaggle Display Advertising Challenge dataset.
# Any single extra argument is forwarded verbatim to dlrm_s_pytorch.py
# (e.g. "--use-gpu" or "--test-freq=1024").

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
# Raw data is read from ./input/train.txt on first run; the preprocessed
# .npz is reused on subsequent runs. Output is mirrored to run_kaggle_pt.log.
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log

echo "done"
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Train DLRM on the Criteo Terabyte dataset (day_0 .. day_23 raw files under
# ./input). Any single extra argument is forwarded verbatim to
# dlrm_s_pytorch.py (e.g. "--test-freq=10240 --memory-map").

#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python dlrm_s_pytorch.py"

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
# --max-ind-range=10000000 hashes sparse indices into a 10M-row embedding.
# Output is mirrored to run_terabyte_pt.log.
$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log

echo "done"
\ No newline at end of file
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# MLPerf reference run on the Criteo Terabyte dataset: 40M-row embeddings,
# binary data loader with shuffling, AUC-based early stop at 0.8025.
# Any single extra argument (e.g. "--use-gpu") is forwarded verbatim.

#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option

# Output is mirrored to run_terabyte_mlperf_pt.log.
python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log

echo "done"
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: compile .so from python code
from __future__ import absolute_import, division, print_function, unicode_literals

from distutils.extension import Extension
from Cython.Build import cythonize
from setuptools import setup

# Build data_utils_cython as an optimized C extension from its .pyx source:
#   python cython_compile.py build_ext --inplace
_SOURCES = ["data_utils_cython.pyx"]
_OPT_FLAGS = ["-O3"]

ext_modules = [
    Extension(
        "data_utils_cython",
        _SOURCES,
        extra_compile_args=_OPT_FLAGS,
        extra_link_args=_OPT_FLAGS,
    )
]

setup(name="data_utils_cython", ext_modules=cythonize(ext_modules))
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: run dataset pre-processing in standalone mode
# WARNING: These steps are required to work with Cython
# 1. Install Cython
# > sudo yum install Cython
# 2. Please copy data_utils.py into data_utils_cython.pyx
# 3. Compile the data_utils_cython.pyx to generate .so
# (it's important to keep extension .pyx rather than .py
# to ensure the C/C++ .so, not the .py, is loaded at import time)
# > python cython_compile.py build_ext --inplace
# This should create data_utils_cython.so, which can be loaded below with "import"
# 4. Run standalone dataset preprocessing to generate .npz files
# a. Kaggle
# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt
# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz
# b. Terabyte
# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte
# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz
from __future__ import absolute_import, division, print_function, unicode_literals
import data_utils_cython as duc
if __name__ == "__main__":
### import packages ###
import argparse
### parse arguments ###
parser = argparse.ArgumentParser(description="Preprocess Criteo dataset")
# model related parameters
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--memory-map", action="store_true", default=False)
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
args = parser.parse_args()
duc.loadDataset(
args.data_set,
args.max_ind_range,
args.data_sub_sample_rate,
args.data_randomize,
"train",
args.raw_data_file,
args.processed_data_file,
args.memory_map,
)
# @lint-ignore-every LICENSELINT
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import math
import os
import time
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
class DataLoader:
    """
    Iterable loader for the Criteo Terabyte Click Logs dataset stored as
    per-day ``<prefix>_<day>_reordered.npz`` files plus a
    ``<prefix>_day_count.npz`` file holding per-day sample counts.
    """

    def __init__(
        self,
        data_filename,
        data_directory,
        days,
        batch_size,
        max_ind_range=-1,
        split="train",
        drop_last_batch=False,
    ):
        self.data_filename = data_filename
        self.data_directory = data_directory
        self.days = days
        self.batch_size = batch_size
        self.max_ind_range = max_ind_range

        # Total sample count comes from the precomputed per-day counts file,
        # restricted to the requested days.
        counts_path = os.path.join(data_directory, data_filename + "_day_count.npz")
        with np.load(counts_path) as counts_npz:
            samples_per_day = counts_npz["total_per_file"][np.array(days)]
        self.length = sum(samples_per_day)
        # "test" and "val" each see half of the selected data (ceil on odd).
        if split in ("test", "val"):
            self.length = int(np.ceil(self.length / 2.0))

        self.split = split
        self.drop_last_batch = drop_last_batch

    def __iter__(self):
        # Delegate to the module-level generator that streams the day files.
        yield from _batch_generator(
            self.data_filename,
            self.data_directory,
            self.days,
            self.batch_size,
            self.split,
            self.drop_last_batch,
            self.max_ind_range,
        )

    def __len__(self):
        # Number of batches: floor when the trailing partial batch is dropped,
        # ceiling when it is kept.
        full_batches, remainder = divmod(self.length, self.batch_size)
        if self.drop_last_batch:
            return full_batches
        return full_batches + (1 if remainder else 0)
def _transform_features(
x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
):
if max_ind_range > 0:
x_cat_batch = x_cat_batch % max_ind_range
if flag_input_torch_tensor:
x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
else:
x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
batch_size = x_cat_batch.shape[0]
feature_count = x_cat_batch.shape[1]
lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
def _batch_generator(
    data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
):
    # Streams fixed-size batches across multiple per-day .npz files.
    # Samples left over at the end of one day are carried in `previous_file`
    # and prepended to the first batch of the next day, so every yielded
    # batch (except possibly the final one) has exactly `batch_size` rows.
    previous_file = None
    for day in days:
        filepath = os.path.join(
            data_directory, data_filename + "_{}_reordered.npz".format(day)
        )
        # print('Loading file: ', filepath)
        with np.load(filepath) as data:
            x_int = data["X_int"]
            x_cat = data["X_cat"]
            y = data["y"]
        samples_in_file = y.shape[0]
        batch_start_idx = 0
        # "test" takes the first half of each file, "val" the second half.
        if split == "test" or split == "val":
            length = int(np.ceil(samples_in_file / 2.0))
            if split == "test":
                samples_in_file = length
            elif split == "val":
                batch_start_idx = samples_in_file - length
        while batch_start_idx < samples_in_file - batch_size:
            # If a partial batch was carried over, only fetch enough rows to
            # top it up to a full batch.
            missing_samples = batch_size
            if previous_file is not None:
                missing_samples -= previous_file["y"].shape[0]

            current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)

            x_int_batch = x_int[current_slice]
            x_cat_batch = x_cat[current_slice]
            y_batch = y[current_slice]

            if previous_file is not None:
                # Prepend the carried-over rows, then clear the carry.
                x_int_batch = np.concatenate(
                    [previous_file["x_int"], x_int_batch], axis=0
                )
                x_cat_batch = np.concatenate(
                    [previous_file["x_cat"], x_cat_batch], axis=0
                )
                y_batch = np.concatenate([previous_file["y"], y_batch], axis=0)
                previous_file = None

            if x_int_batch.shape[0] != batch_size:
                raise ValueError("should not happen")

            yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)

            batch_start_idx += missing_samples
        if batch_start_idx != samples_in_file:
            # Stash the day's tail (less than a full batch) for the next day.
            current_slice = slice(batch_start_idx, samples_in_file)
            if previous_file is not None:
                previous_file = {
                    "x_int": np.concatenate(
                        [previous_file["x_int"], x_int[current_slice]], axis=0
                    ),
                    "x_cat": np.concatenate(
                        [previous_file["x_cat"], x_cat[current_slice]], axis=0
                    ),
                    "y": np.concatenate([previous_file["y"], y[current_slice]], axis=0),
                }
            else:
                previous_file = {
                    "x_int": x_int[current_slice],
                    "x_cat": x_cat[current_slice],
                    "y": y[current_slice],
                }

    # After all days: emit the remaining partial batch unless dropped.
    if not drop_last:
        yield _transform_features(
            previous_file["x_int"],
            previous_file["x_cat"],
            previous_file["y"],
            max_ind_range,
        )
def _test():
    """Smoke-test _batch_generator on local ./input day files, printing the
    wall-clock time and tensor shapes for every batch produced."""
    batches = _batch_generator(
        data_filename="day",
        data_directory="./input",
        days=range(23),
        split="train",
        batch_size=2048,
        drop_last=True,
        max_ind_range=-1,
    )

    last_tick = time.time()
    for dense, offsets, sparse, labels in batches:
        now = time.time()
        elapsed = now - last_tick
        last_tick = now
        print(
            "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
                elapsed, dense.shape, offsets.shape, sparse.shape, labels.shape
            )
        )
class CriteoBinDataset(Dataset):
    """Criteo dataset backed by a single flat binary file of int32 records.

    Each sample is 1 label + 13 dense + 26 sparse features; one __getitem__
    returns a whole mini-batch of `batch_size` samples.
    """

    def __init__(
        self,
        data_file,
        counts_file,
        batch_size=1,
        max_ind_range=-1,
        bytes_per_feature=4,
    ):
        # Fixed record layout for Criteo: label, dense, sparse columns.
        self.tar_fea = 1  # single target
        self.den_fea = 13  # 13 dense features
        self.spa_fea = 26  # 26 sparse features
        self.tad_fea = self.tar_fea + self.den_fea
        self.tot_fea = self.tad_fea + self.spa_fea

        self.batch_size = batch_size
        self.max_ind_range = max_ind_range
        # One "entry" is an entire mini-batch worth of raw bytes.
        self.bytes_per_entry = bytes_per_feature * self.tot_fea * batch_size
        self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)

        print("data file:", data_file, "number of batches:", self.num_entries)
        self.file = open(data_file, "rb")

        with np.load(counts_file) as data:
            self.counts = data["counts"]

        # hardcoded for now
        self.m_den = 13

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Seek to the idx-th batch record and decode it as int32 rows.
        self.file.seek(idx * self.bytes_per_entry, 0)
        raw_data = self.file.read(self.bytes_per_entry)
        batch = torch.from_numpy(np.frombuffer(raw_data, dtype=np.int32)).view(
            (-1, self.tot_fea)
        )
        # Column 0 is the label, 1..13 the dense features, the rest sparse.
        return _transform_features(
            x_int_batch=batch[:, 1:14],
            x_cat_batch=batch[:, 14:],
            y_batch=batch[:, 0],
            max_ind_range=self.max_ind_range,
            flag_input_torch_tensor=True,
        )

    def __del__(self):
        self.file.close()
def numpy_to_binary(input_files, output_file_path, split="train"):
    """Convert the data to a binary format to be read with CriteoBinDataset.

    Rows are laid out as [y | X_int | X_cat] int32 columns. For "train" all
    input files are concatenated; for "test"/"val" a single input file is
    split at its midpoint (first half = test, second half = val).
    """
    # WARNING - both categorical and numerical data must fit into int32 for
    # the following code to work correctly

    def _rows_as_int32(npz):
        # One flat int32 row per sample: label column first, then features.
        return np.concatenate(
            [npz["y"].reshape(-1, 1), npz["X_int"], npz["X_cat"]],
            axis=1,
        ).astype(np.int32)

    with open(output_file_path, "wb") as output_file:
        if split == "train":
            for input_file in input_files:
                print("Processing file: ", input_file)
                output_file.write(_rows_as_int32(np.load(input_file)).tobytes())
        else:
            assert len(input_files) == 1
            rows = _rows_as_int32(np.load(input_files[0]))
            midpoint = int(np.ceil(rows.shape[0] / 2.0))
            if split == "test":
                begin, end = 0, midpoint
            elif split == "val":
                begin, end = midpoint, rows.shape[0]
            else:
                raise ValueError("Unknown split value: ", split)
            output_file.write(rows[begin:end].tobytes())
def _preprocess(args):
    """Convert reordered Criteo Terabyte day files into train/val/test binaries.

    Days 0-22 form the training set; day 23 is split between val and test by
    numpy_to_binary. Outputs <output_directory>/{split}_data.bin.
    """
    train_files = [
        "{}_{}_reordered.npz".format(args.input_data_prefix, day)
        for day in range(0, 23)
    ]
    holdout_file = args.input_data_prefix + "_23_reordered.npz"

    os.makedirs(args.output_directory, exist_ok=True)
    for split in ("train", "val", "test"):
        print("Running preprocessing for split =", split)
        destination = os.path.join(args.output_directory, "{}_data.bin".format(split))
        sources = train_files if split == "train" else [holdout_file]
        numpy_to_binary(
            input_files=sources, output_file_path=destination, split=split
        )
def _test_bin():
    # CLI self-test: converts the .npz day files to the flat binary format,
    # then cross-checks every batch of the binary-backed dataset against the
    # original CriteoDataset pipeline, raising on the first mismatch.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_directory", required=True)
    parser.add_argument("--input_data_prefix", required=True)
    parser.add_argument("--split", choices=["train", "test", "val"], required=True)
    args = parser.parse_args()

    # Writes <output_directory>/{train,val,test}_data.bin from the day files.
    _preprocess(args)

    binary_data_file = os.path.join(
        args.output_directory, "{}_data.bin".format(args.split)
    )
    counts_file = os.path.join(args.output_directory, "day_fea_count.npz")
    dataset_binary = CriteoBinDataset(
        data_file=binary_data_file,
        counts_file=counts_file,
        batch_size=2048,
    )
    # Imported lazily: dlrm_data_pytorch is only needed for this comparison.
    from dlrm_data_pytorch import (
        collate_wrapper_criteo_offset as collate_wrapper_criteo,
        CriteoDataset,
    )

    # batch_size=None because CriteoBinDataset already yields whole batches.
    binary_loader = torch.utils.data.DataLoader(
        dataset_binary,
        batch_size=None,
        shuffle=False,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
    )

    original_dataset = CriteoDataset(
        dataset="terabyte",
        max_ind_range=10 * 1000 * 1000,
        sub_sample_rate=1,
        randomize=True,
        split=args.split,
        raw_path=args.input_data_prefix,
        pro_data="dummy_string",
        memory_map=True,
    )

    original_loader = torch.utils.data.DataLoader(
        original_dataset,
        batch_size=2048,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_wrapper_criteo,
        pin_memory=False,
        drop_last=False,
    )

    assert len(dataset_binary) == len(original_loader)
    # Compare each element of every batch (x_int, lS_o, x_cat, y) pairwise.
    for i, (old_batch, new_batch) in tqdm(
        enumerate(zip(original_loader, binary_loader)), total=len(dataset_binary)
    ):
        for j in range(len(new_batch)):
            if not np.array_equal(old_batch[j], new_batch[j]):
                raise ValueError("FAILED: Datasets not equal")
        if i > len(dataset_binary):
            break
    print("PASSED")
if __name__ == "__main__":
    # NOTE(review): _test() streams ./input day files and _test_bin() requires
    # CLI flags (--output_directory, --input_data_prefix, --split); running
    # this module directly therefore needs both the data and those arguments.
    _test()
    _test_bin()
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the DLRM benchmark
#
# Utility function(s) to download and pre-process public data sets
# - Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# - Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
#
# After downloading dataset, run:
# getCriteoAdData(
# datafile="<path-to-train.txt>",
# o_filename=kaggleAdDisplayChallenge_processed.npz,
# max_ind_range=-1,
# sub_sample_rate=0.0,
# days=7,
# data_split='train',
# randomize='total',
# criteo_kaggle=True,
# memory_map=False
# )
# getCriteoAdData(
# datafile="<path-to-day_{0,...,23}>",
# o_filename=terabyte_processed.npz,
# max_ind_range=-1,
# sub_sample_rate=0.0,
# days=24,
# data_split='train',
# randomize='total',
# criteo_kaggle=False,
# memory_map=False
# )
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
from multiprocessing import Manager, Process
# import os
from os import path
# import io
# from io import StringIO
# import collections as coll
import numpy as np
def convertUStringToDistinctIntsDict(mat, convertDicts, counts):
    """Map a matrix of unicode strings to per-column distinct integer ids.

    Ids are assigned per column in first-seen order using (and extending) the
    supplied dictionaries. If convertDicts/counts do not match the number of
    columns, fresh empty ones are generated.

    Returns (out, convertDicts, counts): the id matrix (float array, same
    shape as mat), the updated per-column dictionaries, and the per-column
    distinct-value counts.
    """
    n_cols = mat.shape[1]
    # check if convertDicts and counts match correct length of mat
    if len(convertDicts) != n_cols or len(counts) != n_cols:
        print("Length of convertDicts or counts does not match input shape")
        print("Generating convertDicts and counts...")
        convertDicts = [{} for _ in range(n_cols)]
        counts = [0] * n_cols

    out = np.zeros(mat.shape)
    for col in range(n_cols):
        mapping = convertDicts[col]
        for row in range(mat.shape[0]):
            key = mat[row, col]
            # First occurrence gets the next free id for this column.
            if key not in mapping:
                mapping[key] = counts[col]
                counts[col] += 1
            out[row, col] = mapping[key]

    return out, convertDicts, counts
def convertUStringToDistinctIntsUnique(mat, mat_uni, counts):
    """Map a matrix of unicode strings to per-column distinct integer ids.

    Unlike the dict-based variant, ids follow np.unique's sorted order over
    the union of previously seen values (mat_uni) and the new column values.
    mat is an array of samples, each with 26 categorical features.

    Returns (out, mat_uni, counts): the id matrix, the updated per-column
    unique-value arrays, and the per-column distinct-value counts.
    """
    n_cols = mat.shape[1]
    # check if mat_unique and counts match correct length of mat
    if len(mat_uni) != n_cols or len(counts) != n_cols:
        print("Length of mat_unique or counts does not match input shape")
        print("Generating mat_unique and counts...")
        mat_uni = [np.array([]) for _ in range(n_cols)]
        counts = [0] * n_cols

    out = np.zeros(mat.shape)
    for col in range(n_cols):
        n_known = mat_uni[col].size
        # Unique over old + new values; the inverse map gives each element's
        # id, and the first n_known entries belong to the old values.
        combined = np.concatenate((mat_uni[col], mat[:, col]))
        mat_uni[col], inverse = np.unique(combined, return_inverse=True)
        out[:, col] = inverse[n_known:]
        counts[col] = mat_uni[col].size

    return out, mat_uni, counts
def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts):
    """Convert one day's raw split into its "_processed" npz file.

    Processes the Kaggle Display Advertising Challenge or Terabyte dataset by
    mapping each categorical value in X_cat_t through the pre-computed
    convertDicts and clamping negative dense features in X_int to zero.
    Loads "<npzfile>_<i>.npz" and writes "<npzfile>_<i>_processed.npz";
    work is skipped when the processed file already exists.

    Inputs:
        d_path (str): path for {kaggle|terabyte}_day_i.npz files
        d_file (str): dataset file prefix (unused here)
        npzfile (str): per-day npz file prefix
        i (int): split index (typically 0 to 7 or 0 to 24)
        convertDicts (list): per-column dict mapping raw value -> int id
        pre_comp_counts (list): pre-computed category counts (sanity only)
    """
    out_name = npzfile + "_{0}_processed.npz".format(i)
    if path.exists(out_name):
        print("Using existing " + out_name, end="\n")
        return
    print("Not existing " + out_name)
    with np.load(npzfile + "_{0}.npz".format(i)) as data:
        # categorical features: remap raw ids column by column through the
        # pre-computed per-column dictionaries
        X_cat_t = np.zeros(data["X_cat_t"].shape)
        for col in range(26):
            lookup = convertDicts[col]
            for row, raw in enumerate(data["X_cat_t"][col, :]):
                X_cat_t[col, row] = lookup[raw]
        # continuous features: clamp negative values to zero
        X_int = data["X_int"]
        X_int[X_int < 0] = 0
        # targets
        y = data["y"]
        np.savez_compressed(
            out_name,
            X_cat=np.transpose(X_cat_t),  # transpose of the data
            X_int=X_int,
            y=y,
        )
        print("Processed " + out_name, end="\n")
    return
def concatCriteoAdData(
    d_path,
    d_file,
    npzfile,
    trafile,
    days,
    data_split,
    randomize,
    total_per_file,
    total_count,
    memory_map,
    o_filename,
):
    """Concatenate the per-day processed files and save the result.

    Without memory_map, all days are loaded and concatenated into a single
    "<d_path><o_filename>.npz" file. With memory_map, a two-pass
    Fisher-Yates-Rao shuffle redistributes samples across per-day
    "_reordered.npz" files instead (no single output file is produced,
    but the same path string is returned).

    Inputs:
        d_path (str): path for {kaggle|terabyte}_day_i.npz files
        d_file (str): dataset file prefix
        npzfile (str): per-day npz file prefix
        trafile (str): prefix for transposed/intermediate feature files
        days (int): total number of days in the dataset (typically 7 or 24)
        data_split (str): dataset split flag ("none" shuffles every day)
        randomize (str): "none", "day" or "total" shuffling scheme
        total_per_file (list): number of samples per day
        total_count (int): total number of samples
        memory_map (bool): use the memory-mapped per-day reordering path
        o_filename (str): output file name

    Output:
        o_file (str): output file path
    """
    if memory_map:
        # dataset break up per fea
        # tar_fea = 1 # single target
        den_fea = 13  # 13 dense features
        spa_fea = 26  # 26 sparse features
        # tad_fea = tar_fea + den_fea
        # tot_fea = tad_fea + spa_fea
        # create cumulative offset per file: offset_per_file[i] is the index
        # of day i's first sample in the full concatenated dataset
        offset_per_file = np.array([0] + [x for x in total_per_file])
        for i in range(days):
            offset_per_file[i + 1] += offset_per_file[i]
        """
        # Approach 1, 2 and 3 use indices, while Approach 4 does not use them
        # create indices
        indices = np.arange(total_count)
        if data_split == "none":
            if randomize == "total":
                indices = np.random.permutation(indices)
        else:
            indices = np.array_split(indices, offset_per_file[1:-1])

            # randomize train data (per day)
            if randomize == "day":  # or randomize == "total":
                for i in range(len(indices) - 1):
                    indices[i] = np.random.permutation(indices[i])
                print("Randomized indices per day ...")

            train_indices = np.concatenate(indices[:-1])
            test_indices = indices[-1]

            # randomize train data (across days)
            if randomize == "total":
                train_indices = np.random.permutation(train_indices)
                print("Randomized indices across days ...")

            indices = np.concatenate((train_indices, test_indices))
        # no reordering
        # indices = np.arange(total_count)
        """
        """
        # Approach 1: simple and slow (no grouping is used)
        # check if data already exists
        recreate_flag = False
        for j in range(tot_fea):
            filename_j = trafile + "_{0}_reordered.npy".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((total_count))
            for j in range(tot_fea):
                filename_j = trafile + "_{0}_reordered".format(j)
                np.save(filename_j, z)
                print("Creating " + filename_j)

            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat_t = np.transpose(data["X_cat"])
                    X_int_t = np.transpose(data["X_int"])
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                # print(filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r+')
                    if j < tar_fea:
                        fj[indices[start:end]] = y
                    elif tar_fea <= j and j < tad_fea:
                        fj[indices[start:end]] = X_int_t[j - tar_fea, :]
                    else:
                        fj[indices[start:end]] = X_cat_t[j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing " + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                size = total_per_file[i]
                X_int_t = np.zeros((den_fea, size))
                X_cat_t = np.zeros((spa_fea, size))
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                print("Creating " + filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r')
                    if j < tar_fea:
                        y = fj[start:end]
                    elif tar_fea <= j and j < tad_fea:
                        X_int_t[j - tar_fea, :] = fj[start:end]
                    else:
                        X_cat_t[j - tad_fea, :] = fj[start:end]
                    del fj

                np.savez_compressed(
                    filename_i,
                    X_cat=np.transpose(X_cat_t),  # transpose of the data
                    X_int=np.transpose(X_int_t),  # transpose of the data
                    y=y,
                )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        """
        # Approach 2: group days
        # check if data already exists
        recreate_flag = False
        for j in range(tot_fea):
            filename_j = trafile + "_{0}_reordered.npy".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((total_count))
            for j in range(tot_fea):
                filename_j = trafile + "_{0}_reordered".format(j)
                np.save(filename_j, z)
                print("Creating " + filename_j)

            group_day = 3  # e.g. 8, 4 or 3
            group_num = days // group_day
            file_group = [i*group_day for i in range(group_num)] + [days]
            for ii in range(group_num):
                # for last may be group_size != group_num, therefore reset it below
                group_size = file_group[ii + 1] - file_group[ii]
                X_cat_t = [0]*group_size
                X_int_t = [0]*group_size
                y = [0]*group_size
                start = [0]*group_size
                end = [0]*group_size
                for ig in range(group_size):
                    i = file_group[ii] + ig
                    filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                    # setup start and end ranges
                    start[ig] = offset_per_file[i]
                    end[ig] = offset_per_file[i + 1]
                    # print(filename_i)
                    # load a group of files
                    with np.load(filename_i) as data:
                        X_cat_t[ig] = np.transpose(data["X_cat"])
                        X_int_t[ig] = np.transpose(data["X_int"])
                        y[ig] = data["y"]
                    # sanity check
                    if total_per_file[i] != len(y[ig]):
                        sys.exit("ERROR: sanity check on number of samples failed")
                    # print("start=" + str(start) + " end=" + str(end)
                    # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i]))

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r+')
                    for ig in range(group_size):
                        if j < tar_fea:
                            fj[indices[start[ig]:end[ig]]] = y[ig]
                        elif tar_fea <= j and j < tad_fea:
                            fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :]
                        else:
                            fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing " + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for ii in range(group_num):
                # for last may be group_size != group_num, therefore reset it below
                group_size = file_group[ii + 1] - file_group[ii]
                X_cat_t= []; X_int_t = []
                for ig in range(group_size):
                    i = file_group[ii] + ig
                    X_int_t.append(np.zeros((den_fea, total_per_file[i])))
                    X_cat_t.append(np.zeros((spa_fea, total_per_file[i])))
                y = [0]*group_size
                start = [0]*group_size
                end = [0]*group_size

                for j in range(tot_fea):
                    filename_j = trafile + "_{0}_reordered.npy".format(j)
                    fj = np.load(filename_j, mmap_mode='r')
                    # load a group of files
                    for ig in range(group_size):
                        i = file_group[ii] + ig
                        # setup start and end ranges
                        start[ig] = offset_per_file[i]
                        end[ig] = offset_per_file[i + 1]
                        # load data for the group of files
                        if j < tar_fea:
                            y[ig] = fj[start[ig]:end[ig]]
                        elif tar_fea <= j and j < tad_fea:
                            X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]]
                        else:
                            X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]]

                    del fj

                for ig in range(group_size):
                    i = file_group[ii] + ig
                    filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                    print("Creating " + filename_i)

                    np.savez_compressed(
                        filename_i,
                        X_cat=np.transpose(X_cat_t[ig]),  # transpose of the data
                        X_int=np.transpose(X_int_t[ig]),  # transpose of the data
                        y=y[ig],
                    )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        """
        # Approach 3: group features
        # check if data already exists
        group_fea = 5  # e.g. 8, 5 or 4
        group_num = tot_fea // group_fea
        if tot_fea % group_fea != 0:  # sanity check
            sys.exit("ERROR: the group_fea must divided tot_fea evenly.")
        recreate_flag = False
        for jn in range(group_num):
            filename_j = trafile + "_{0}_reordered{1}.npy".format(
                jn, group_fea
            )
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # load, reorder and concatenate data (memmap all reordered files per feature)
        if recreate_flag:
            # init reordered files (.npy appended automatically)
            z = np.zeros((group_fea, total_count))
            for jn in range(group_num):
                filename_j = trafile + "_{0}_reordered{1}".format(
                    jn, group_fea
                )
                np.save(filename_j, z)
                print("Creating " + filename_j)

            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat_t = np.transpose(data["X_cat"])
                    X_int_t = np.transpose(data["X_int"])
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                # print(filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for jn in range(group_num):
                    filename_j = trafile + "_{0}_reordered{1}.npy".format(
                        jn, group_fea
                    )
                    fj = np.load(filename_j, mmap_mode='r+')
                    for jg in range(group_fea):
                        j = jn * group_fea + jg
                        # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
                        if j < tar_fea:
                            fj[jg, indices[start:end]] = y
                        elif tar_fea <= j and j < tad_fea:
                            fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :]
                        else:
                            fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :]
                    del fj
        else:
            print("Reordered fea files already exist, skipping ...")

        # check if data already exists
        recreate_flag = False
        for i in range(days):
            filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
            if path.exists(filename_i):
                print("Using existing" + filename_i)
            else:
                recreate_flag = True
        # split reordered data by files (memmap all reordered files per feature)
        # on the day boundary del the file object and memmap again
        if recreate_flag:
            for i in range(days):
                filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
                size = total_per_file[i]
                X_int_t = np.zeros((den_fea, size))
                X_cat_t = np.zeros((spa_fea, size))
                # setup start and end ranges
                start = offset_per_file[i]
                end = offset_per_file[i + 1]
                print("Creating " + filename_i)
                # print("start=" + str(start) + " end=" + str(end)
                #       + " diff=" + str(end - start) + "=" + str(total_per_file[i]))

                for jn in range(group_num):
                    filename_j = trafile + "_{0}_reordered{1}.npy".format(
                        jn, group_fea
                    )
                    fj = np.load(filename_j, mmap_mode='r')
                    for jg in range(group_fea):
                        j = jn * group_fea + jg
                        # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
                        if j < tar_fea:
                            y = fj[jg, start:end]
                        elif tar_fea <= j and j < tad_fea:
                            X_int_t[j - tar_fea, :] = fj[jg, start:end]
                        else:
                            X_cat_t[j - tad_fea, :] = fj[jg, start:end]
                    del fj

                np.savez_compressed(
                    filename_i,
                    X_cat=np.transpose(X_cat_t),  # transpose of the data
                    X_int=np.transpose(X_int_t),  # transpose of the data
                    y=y,
                )
        else:
            print("Reordered day files already exist, skipping ...")
        """
        # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm
        # 1st pass of FYR shuffle
        # check if data already exists
        recreate_flag = False
        for j in range(days):
            filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
            filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
            filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
            if (
                path.exists(filename_j_y)
                and path.exists(filename_j_d)
                and path.exists(filename_j_s)
            ):
                print(
                    "Using existing\n"
                    + filename_j_y
                    + "\n"
                    + filename_j_d
                    + "\n"
                    + filename_j_s
                )
            else:
                recreate_flag = True
        # reorder across buckets using sampling
        if recreate_flag:
            # init intermediate files (.npy appended automatically)
            # one target (y), dense (d) and sparse (s) buffer per day
            for j in range(days):
                filename_j_y = npzfile + "_{0}_intermediate_y".format(j)
                filename_j_d = npzfile + "_{0}_intermediate_d".format(j)
                filename_j_s = npzfile + "_{0}_intermediate_s".format(j)
                np.save(filename_j_y, np.zeros((total_per_file[j])))
                np.save(filename_j_d, np.zeros((total_per_file[j], den_fea)))
                np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea)))
            # start processing files
            # total_counter[j] tracks how far bucket j has been filled so far
            total_counter = [0] * days
            for i in range(days):
                filename_i = npzfile + "_{0}_processed.npz".format(i)
                with np.load(filename_i) as data:
                    X_cat = data["X_cat"]
                    X_int = data["X_int"]
                    y = data["y"]
                size = len(y)
                # sanity check
                if total_per_file[i] != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # debug prints
                print("Reordering (1st pass) " + filename_i)

                # create buckets using sampling of random ints
                # from (discrete) uniform distribution
                buckets = []
                for _j in range(days):
                    buckets.append([])
                counter = [0] * days
                days_to_sample = days if data_split == "none" else days - 1
                if randomize == "total":
                    rand_u = np.random.randint(low=0, high=days_to_sample, size=size)
                    for k in range(size):
                        # sample and make sure elements per buckets do not overflow
                        if data_split == "none" or i < days - 1:
                            # choose bucket
                            p = rand_u[k]
                            # retry if the chosen bucket is already full
                            while total_counter[p] + counter[p] >= total_per_file[p]:
                                p = np.random.randint(low=0, high=days_to_sample)
                        else:  # preserve the last day/bucket if needed
                            p = i
                        buckets[p].append(k)
                        counter[p] += 1
                else:  # randomize is day or none
                    for k in range(size):
                        # do not sample, preserve the data in this bucket
                        p = i
                        buckets[p].append(k)
                        counter[p] += 1

                # sanity check
                if np.sum(counter) != size:
                    sys.exit("ERROR: sanity check on number of samples failed")
                # debug prints
                # print(counter)
                # print(str(np.sum(counter)) + " = " + str(size))
                # print([len(x) for x in buckets])
                # print(total_counter)

                # partially fill the buckets with this day's samples
                for j in range(days):
                    filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
                    filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
                    filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
                    start = total_counter[j]
                    end = total_counter[j] + counter[j]
                    # target buckets
                    fj_y = np.load(filename_j_y, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_y[start:end].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_y[start:end] = y[buckets[j]]
                    del fj_y
                    # dense buckets
                    fj_d = np.load(filename_j_d, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_d[start:end, :].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_d[start:end, :] = X_int[buckets[j], :]
                    del fj_d
                    # sparse buckets
                    fj_s = np.load(filename_j_s, mmap_mode="r+")
                    # print("start=" + str(start) + " end=" + str(end)
                    #       + " end - start=" + str(end - start) + " "
                    #       + str(fj_s[start:end, :].shape) + " "
                    #       + str(len(buckets[j])))
                    fj_s[start:end, :] = X_cat[buckets[j], :]
                    del fj_s
                    # update counters for next step
                    total_counter[j] += counter[j]

        # 2nd pass of FYR shuffle
        # check if data already exists
        # NOTE(review): recreate_flag is NOT reset here — if the 1st pass
        # recreated the intermediate files, the 2nd pass reruns even when
        # reordered files already exist; confirm this carry-over is intended
        for j in range(days):
            filename_j = npzfile + "_{0}_reordered.npz".format(j)
            if path.exists(filename_j):
                print("Using existing " + filename_j)
            else:
                recreate_flag = True
        # reorder within buckets
        if recreate_flag:
            for j in range(days):
                filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
                filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
                filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
                fj_y = np.load(filename_j_y)
                fj_d = np.load(filename_j_d)
                fj_s = np.load(filename_j_s)

                # shuffle each (non-final, unless data_split is "none") bucket
                indices = range(total_per_file[j])
                if randomize == "day" or randomize == "total":
                    if data_split == "none" or j < days - 1:
                        indices = np.random.permutation(range(total_per_file[j]))

                filename_r = npzfile + "_{0}_reordered.npz".format(j)
                print("Reordering (2nd pass) " + filename_r)
                np.savez_compressed(
                    filename_r,
                    X_cat=fj_s[indices, :],
                    X_int=fj_d[indices, :],
                    y=fj_y[indices],
                )

        """
        # sanity check (under no reordering norms should be zero)
        for i in range(days):
            filename_i_o = npzfile + "_{0}_processed.npz".format(i)
            print(filename_i_o)
            with np.load(filename_i_o) as data_original:
                X_cat_o = data_original["X_cat"]
                X_int_o = data_original["X_int"]
                y_o = data_original["y"]
            filename_i_r = npzfile + "_{0}_reordered.npz".format(i)
            print(filename_i_r)
            with np.load(filename_i_r) as data_reordered:
                X_cat_r = data_reordered["X_cat"]
                X_int_r = data_reordered["X_int"]
                y_r = data_reordered["y"]
            print(np.linalg.norm(y_o - y_r))
            print(np.linalg.norm(X_int_o - X_int_r))
            print(np.linalg.norm(X_cat_o - X_cat_r))
        """
    else:
        print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename))

        # load and concatenate data from all days into single arrays
        for i in range(days):
            filename_i = npzfile + "_{0}_processed.npz".format(i)
            with np.load(filename_i) as data:
                if i == 0:
                    X_cat = data["X_cat"]
                    X_int = data["X_int"]
                    y = data["y"]
                else:
                    X_cat = np.concatenate((X_cat, data["X_cat"]))
                    X_int = np.concatenate((X_int, data["X_int"]))
                    y = np.concatenate((y, data["y"]))
            print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0]))

        # attach the pre-computed per-column category counts
        with np.load(d_path + d_file + "_fea_count.npz") as data:
            counts = data["counts"]
        print("Loaded counts!")

        np.savez_compressed(
            d_path + o_filename + ".npz",
            X_cat=X_cat,
            X_int=X_int,
            y=y,
            counts=counts,
        )

    return d_path + o_filename + ".npz"
def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file):
    """Apply a log transform to dense features and split/randomize the data.

    Transforms Criteo Kaggle or Terabyte data: categorical ids are cast to
    int, dense features become log(x + 1) in float32, labels become float32.

    Inputs:
        X_cat (ndarray): preprocessed categorical features (integer ids)
        X_int (ndarray): dense integer features
        y (ndarray): array of bool corresponding to labels
        days (int): number of day splits in the dataset
        data_split (str): "train" carves out training/validation/test sets
        randomize (str): randomization scheme
            "none": no randomization
            "day": randomizes each day's data (only works if split = True)
            "total": randomizes total dataset

    Outputs:
        9-tuple (X_cat_train, X_int_train, y_train,
                 X_cat_val, X_int_val, y_val,
                 X_cat_test, X_int_test, y_test).
        When data_split is not "train", the first three slots hold the full
        transformed arrays and the remaining six are empty lists.
    """
    # define initial set of indices
    indices = np.arange(len(y))
    # cumulative per-day offsets: offset_per_file[i] is day i's first index
    offset_per_file = np.array([0] + list(total_per_file))
    for i in range(days):
        offset_per_file[i + 1] += offset_per_file[i]

    if data_split == "train":
        indices = np.array_split(indices, offset_per_file[1:-1])

        if randomize == "day":  # or randomize == "total":
            # shuffle each training day independently
            for i in range(len(indices) - 1):
                indices[i] = np.random.permutation(indices[i])
            print("Randomized indices per day ...")

        train_indices = np.concatenate(indices[:-1])
        # the last day is halved into test and validation sets
        test_indices = indices[-1]
        test_indices, val_indices = np.array_split(test_indices, 2)
        print("Defined training and testing indices...")

        if randomize == "total":
            # shuffle the training set across day boundaries
            train_indices = np.random.permutation(train_indices)
            print("Randomized indices across days ...")

        # slice each set and transform: categorical -> int ids,
        # dense -> log(x + 1) float32, labels -> float32
        X_cat_train = X_cat[train_indices].astype(int)
        X_int_train = np.log(X_int[train_indices].astype(np.float32) + 1)
        y_train = y[train_indices].astype(np.float32)
        X_cat_val = X_cat[val_indices].astype(int)
        X_int_val = np.log(X_int[val_indices].astype(np.float32) + 1)
        y_val = y[val_indices].astype(np.float32)
        X_cat_test = X_cat[test_indices].astype(int)
        X_int_test = np.log(X_int[test_indices].astype(np.float32) + 1)
        y_test = y[test_indices].astype(np.float32)
        print("Split data according to indices...")
        print("Converted to tensors...done!")

        return (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        )

    # no split requested: transform (and optionally shuffle) everything
    if randomize == "total":
        indices = np.random.permutation(indices)
        print("Randomized indices...")

    X_cat = X_cat[indices].astype(int)
    X_int = np.log(X_int[indices].astype(np.float32) + 1)
    y = y[indices].astype(np.float32)
    print("Converted to tensors...done!")

    return (X_cat, X_int, y, [], [], [], [], [], [])
def getCriteoAdData(
    datafile,
    o_filename,
    max_ind_range=-1,
    sub_sample_rate=0.0,
    days=7,
    data_split="train",
    randomize="total",
    criteo_kaggle=True,
    memory_map=False,
    dataset_multiprocessing=False,
):
    """Run the full Criteo preprocessing pipeline over the raw data.

    Passes through the entire dataset, splits it into per-day files, defines
    dictionaries for the 26 categorical features, determines the total number
    of categories per feature, converts each day with processCriteoAdData,
    and finally concatenates/reorders via concatCriteoAdData.

    Inputs:
        datafile (str): path to downloaded raw data file
        o_filename (str): saves results under o_filename if filename is not ""
        max_ind_range (int): if > 0, hash categorical ids modulo this range
        sub_sample_rate (float): probability of dropping a zero-target sample
        days (int): number of day splits (7 for Kaggle, 24 for Terabyte)
        data_split (str): dataset splitting flag, forwarded downstream
        randomize (str): "none", "day" or "total" shuffling scheme
        criteo_kaggle (bool): True for Kaggle layout, False for Terabyte
        memory_map (bool): forwarded to concatCriteoAdData
        dataset_multiprocessing (bool): parse/process days in parallel
            processes (uses multiprocessing Manager/Process)

    Output:
        o_file (str): output file path
    """
    # split the datafile into path and filename
    lstr = datafile.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1]
    npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file)
    trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea")

    # count number of datapoints in training set
    total_file = d_path + d_file + "_day_count.npz"
    if path.exists(total_file):
        with np.load(total_file) as data:
            total_per_file = list(data["total_per_file"])
        total_count = np.sum(total_per_file)
        print("Skipping counts per file (already exist)")
    else:
        total_count = 0
        total_per_file = []
        if criteo_kaggle:
            # WARNING: The raw data consists of a single train.txt file
            # Each line in the file is a sample, consisting of 13 continuous and
            # 26 categorical features (an extra space indicates that feature is
            # missing and will be interpreted as 0).
            if path.exists(datafile):
                print("Reading data from path=%s" % (datafile))
                with open(str(datafile)) as f:
                    for _ in f:
                        total_count += 1
                total_per_file.append(total_count)
                # reset total per file due to split: distribute samples as
                # evenly as possible across `days` files (extras go first)
                num_data_per_split, extras = divmod(total_count, days)
                total_per_file = [num_data_per_split] * days
                for j in range(extras):
                    total_per_file[j] += 1
                # split into days (simplifies code later on)
                file_id = 0
                boundary = total_per_file[file_id]
                nf = open(npzfile + "_" + str(file_id), "w")
                with open(str(datafile)) as f:
                    for j, line in enumerate(f):
                        if j == boundary:
                            nf.close()
                            file_id += 1
                            nf = open(npzfile + "_" + str(file_id), "w")
                            boundary += total_per_file[file_id]
                        nf.write(line)
                nf.close()
            else:
                sys.exit(
                    "ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset"
                )
        else:
            # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files
            # Each line in the file is a sample, consisting of 13 continuous and
            # 26 categorical features (an extra space indicates that feature is
            # missing and will be interpreted as 0).
            for i in range(days):
                datafile_i = datafile + "_" + str(i)  # + ".gz"
                if path.exists(str(datafile_i)):
                    print("Reading data from path=%s" % (str(datafile_i)))
                    # file day_<number>
                    total_per_file_count = 0
                    with open(str(datafile_i)) as f:
                        for _ in f:
                            total_per_file_count += 1
                    total_per_file.append(total_per_file_count)
                    total_count += total_per_file_count
                else:
                    sys.exit(
                        "ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs"
                    )

    # process a file worth of data and reinitialize data
    # note that a file may contain a single or multiple splits
    def process_one_file(
        datfile,
        npzfile,
        split,
        num_data_in_split,
        dataset_multiprocessing,
        convertDictsDay=None,
        resultDay=None,
    ):
        """Parse one raw text split into "<npzfile>_<split>.npz".

        Reads tab-separated lines (target, 13 ints, 26 hex strings), fills
        missing fields with 0, optionally sub-samples zero targets, and
        records the set of observed categorical values. Closes over
        sub_sample_rate, max_ind_range and (when not multiprocessing)
        convertDicts from getCriteoAdData. Returns the number of samples
        kept, or writes it into resultDay[split] when multiprocessing.
        """
        if dataset_multiprocessing:
            # per-process uniques; merged into convertDicts by the parent
            convertDicts_day = [{} for _ in range(26)]

        with open(str(datfile)) as f:
            y = np.zeros(num_data_in_split, dtype="i4")  # 4 byte int
            X_int = np.zeros((num_data_in_split, 13), dtype="i4")  # 4 byte int
            X_cat = np.zeros((num_data_in_split, 26), dtype="i4")  # 4 byte int
            if sub_sample_rate == 0.0:
                rand_u = 1.0
            else:
                rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split)

            i = 0
            percent = 0
            for k, line in enumerate(f):
                # process a line (data point)
                line = line.split("\t")
                # set missing values to zero
                for j in range(len(line)):
                    if (line[j] == "") or (line[j] == "\n"):
                        line[j] = "0"
                # sub-sample data by dropping zero targets, if needed
                target = np.int32(line[0])
                if (
                    target == 0
                    and (rand_u if sub_sample_rate == 0.0 else rand_u[k])
                    < sub_sample_rate
                ):
                    continue

                y[i] = target
                X_int[i] = np.array(line[1:14], dtype=np.int32)
                # categorical fields are hex strings; optionally hashed
                # into [0, max_ind_range)
                if max_ind_range > 0:
                    X_cat[i] = np.array(
                        list(map(lambda x: int(x, 16) % max_ind_range, line[14:])),
                        dtype=np.int32,
                    )
                else:
                    X_cat[i] = np.array(
                        list(map(lambda x: int(x, 16), line[14:])), dtype=np.int32
                    )

                # count uniques
                if dataset_multiprocessing:
                    for j in range(26):
                        convertDicts_day[j][X_cat[i][j]] = 1
                    # debug prints
                    if float(i) / num_data_in_split * 100 > percent + 1:
                        percent = int(float(i) / num_data_in_split * 100)
                        print(
                            "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d"
                            % (
                                i,
                                num_data_in_split,
                                percent,
                                split,
                                target,
                                y[i],
                            ),
                            end="\n",
                        )
                else:
                    for j in range(26):
                        convertDicts[j][X_cat[i][j]] = 1
                    # debug prints
                    print(
                        "Load %d/%d Split: %d Label True: %d Stored: %d"
                        % (
                            i,
                            num_data_in_split,
                            split,
                            target,
                            y[i],
                        ),
                        end="\r",
                    )
                i += 1

            # store num_data_in_split samples or extras at the end of file
            # count uniques
            # X_cat_t = np.transpose(X_cat)
            # for j in range(26):
            #     for x in X_cat_t[j,:]:
            #         convertDicts[j][x] = 1
            # store parsed
            filename_s = npzfile + "_{0}.npz".format(split)
            if path.exists(filename_s):
                print("\nSkip existing " + filename_s)
            else:
                np.savez_compressed(
                    filename_s,
                    X_int=X_int[0:i, :],
                    # X_cat=X_cat[0:i, :],
                    X_cat_t=np.transpose(X_cat[0:i, :]),  # transpose of the data
                    y=y[0:i],
                )
                print("\nSaved " + npzfile + "_{0}.npz!".format(split))

        if dataset_multiprocessing:
            resultDay[split] = i
            convertDictsDay[split] = convertDicts_day
            return
        else:
            return i

    # create all splits (reuse existing files if possible)
    recreate_flag = False
    convertDicts = [{} for _ in range(26)]
    # WARNING: to get reproducible sub-sampling results you must reset the seed below
    # np.random.seed(123)
    # in this case there is a single split in each day
    for i in range(days):
        npzfile_i = npzfile + "_{0}.npz".format(i)
        npzfile_p = npzfile + "_{0}_processed.npz".format(i)
        if path.exists(npzfile_i):
            print("Skip existing " + npzfile_i)
        elif path.exists(npzfile_p):
            print("Skip existing " + npzfile_p)
        else:
            recreate_flag = True

    if recreate_flag:
        if dataset_multiprocessing:
            # parse all days in parallel; Manager dicts collect each
            # process's sample count and observed categorical values
            resultDay = Manager().dict()
            convertDictsDay = Manager().dict()
            processes = [
                Process(
                    target=process_one_file,
                    name="process_one_file:%i" % i,
                    args=(
                        npzfile + "_{0}".format(i),
                        npzfile,
                        i,
                        total_per_file[i],
                        dataset_multiprocessing,
                        convertDictsDay,
                        resultDay,
                    ),
                )
                for i in range(0, days)
            ]
            for process in processes:
                process.start()
            for process in processes:
                process.join()
            for day in range(days):
                total_per_file[day] = resultDay[day]
                print("Constructing convertDicts Split: {}".format(day))
                # merge per-day uniques into the global dictionaries
                convertDicts_tmp = convertDictsDay[day]
                for i in range(26):
                    for j in convertDicts_tmp[i]:
                        convertDicts[i][j] = 1
        else:
            for i in range(days):
                total_per_file[i] = process_one_file(
                    npzfile + "_{0}".format(i),
                    npzfile,
                    i,
                    total_per_file[i],
                    dataset_multiprocessing,
                )

    # report and save total into a file
    total_count = np.sum(total_per_file)
    if not path.exists(total_file):
        np.savez_compressed(total_file, total_per_file=total_per_file)
    print("Total number of samples:", total_count)
    print("Divided into days/splits:\n", total_per_file)

    # dictionary files
    counts = np.zeros(26, dtype=np.int32)
    if recreate_flag:
        # create dictionaries: assign consecutive ids in insertion order
        for j in range(26):
            for i, x in enumerate(convertDicts[j]):
                convertDicts[j][x] = i
            dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j)
            if not path.exists(dict_file_j):
                np.savez_compressed(
                    dict_file_j, unique=np.array(list(convertDicts[j]), dtype=np.int32)
                )
            counts[j] = len(convertDicts[j])
        # store (uniques and) counts
        count_file = d_path + d_file + "_fea_count.npz"
        if not path.exists(count_file):
            np.savez_compressed(count_file, counts=counts)
    else:
        # create dictionaries (from existing files)
        for j in range(26):
            with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data:
                unique = data["unique"]
            for i, x in enumerate(unique):
                convertDicts[j][x] = i
        # load (uniques and) counts
        with np.load(d_path + d_file + "_fea_count.npz") as data:
            counts = data["counts"]

    # process all splits
    if dataset_multiprocessing:
        processes = [
            Process(
                target=processCriteoAdData,
                name="processCriteoAdData:%i" % i,
                args=(
                    d_path,
                    d_file,
                    npzfile,
                    i,
                    convertDicts,
                    counts,
                ),
            )
            for i in range(0, days)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
    else:
        for i in range(days):
            processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts)

    o_file = concatCriteoAdData(
        d_path,
        d_file,
        npzfile,
        trafile,
        days,
        data_split,
        randomize,
        total_per_file,
        total_count,
        memory_map,
        o_filename,
    )

    return o_file
def loadDataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    randomize,
    data_split,
    raw_path="",
    pro_data="",
    memory_map=False,
):
    """Return (file, days) for the requested Criteo dataset.

    If a pre-processed file (or, with memory_map, the full set of per-day
    reordered files) already exists it is used directly; otherwise the raw
    data is preprocessed via getCriteoAdData first.

    Inputs:
        dataset (str): "kaggle" or "terabyte"
        max_ind_range (int): hash range for categorical ids (forwarded)
        sub_sample_rate (float): zero-target sub-sampling rate (forwarded)
        randomize (str): "none", "day" or "total" (forwarded)
        data_split (str): dataset splitting flag (forwarded)
        raw_path (str): path to the raw data file
        pro_data (str): path to an already pre-processed npz file
        memory_map (bool): expect per-day reordered files instead

    Outputs:
        file (str): path to the (pre)processed data
        days (int): number of day splits (7 for kaggle, 24 for terabyte)

    Raises:
        ValueError: if dataset is neither "kaggle" nor "terabyte"
    """
    if dataset == "kaggle":
        days, o_filename = 7, "kaggleAdDisplayChallenge_processed"
    elif dataset == "terabyte":
        days, o_filename = 24, "terabyte_processed"
    else:
        raise (ValueError("Data set option is not supported"))

    # split the datafile into path and filename
    lstr = raw_path.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    npzfile = (d_file + "_day") if dataset == "kaggle" else d_file
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")

    # check if pre-processed data is available
    # WARNING: when memory mapping is used we get a collection of files,
    # one reordered file per day, and all of them must exist
    if memory_map:
        data_ready = all(
            path.exists(str(d_path + npzfile + "_{0}_reordered.npz".format(i)))
            for i in range(days)
        )
    else:
        data_ready = path.exists(str(pro_data))

    # pre-process data if needed
    if data_ready:
        print("Reading pre-processed data=%s" % (str(pro_data)))
        file = str(pro_data)
    else:
        print("Reading raw data=%s" % (str(raw_path)))
        file = getCriteoAdData(
            raw_path,
            o_filename,
            max_ind_range,
            sub_sample_rate,
            days,
            data_split,
            randomize,
            dataset == "kaggle",
            memory_map,
        )

    return file, days
if __name__ == "__main__":
    ### import packages ###
    import argparse

    ### parse arguments ###
    # command-line options controlling the Criteo preprocessing run
    cli = argparse.ArgumentParser(description="Preprocess Criteo dataset")
    # model related parameters
    cli.add_argument("--max-ind-range", type=int, default=-1)
    cli.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
    cli.add_argument("--data-randomize", type=str, default="total")  # or day or none
    cli.add_argument("--memory-map", action="store_true", default=False)
    cli.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
    cli.add_argument("--raw-data-file", type=str, default="")
    cli.add_argument("--processed-data-file", type=str, default="")
    opts = cli.parse_args()

    # run the full preprocessing pipeline on the "train" split
    loadDataset(
        opts.data_set,
        opts.max_ind_range,
        opts.data_sub_sample_rate,
        opts.data_randomize,
        "train",
        opts.raw_data_file,
        opts.processed_data_file,
        opts.memory_map,
    )
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the dlrm benchmark
# The inputs and outputs are generated according to the following three option(s)
# 1) random distribution
# 2) synthetic distribution, based on unique accesses and distances between them
# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven
# Simulation of Cache Memory", IEEE AINAM'07
# 3) public data set
# i) Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# ii) Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
from __future__ import absolute_import, division, print_function, unicode_literals
import bisect
import collections
# others
# from os import path
import sys
import data_utils
# numpy
import numpy as np
# pytorch
import torch
from numpy import random as ra
from torch.utils.data import Dataset
# Kaggle Display Advertising Challenge Dataset
# dataset (str): name of dataset (Kaggle or Terabyte)
# randomize (str): determines randomization scheme
# 'none': no randomization
# 'day': randomizes each day's data (only works if split = True)
# 'total': randomizes total dataset
# split (bool) : to split into train, test, validation data-sets
class CriteoDatasetWMemoryMap(Dataset):
    """Criteo dataset that reads samples from per-day ``*_reordered.npz``
    files (memory-map style) instead of holding the whole dataset in memory.

    dataset (str): "kaggle" (7 days) or "terabyte" (24 days)
    max_ind_range (int): if > 0, sparse indices are returned modulo this value
    sub_sample_rate, randomize: accepted for interface compatibility; the
        visible code of this class never references them
    split (str): "none" | "train" | "test" | "val"
    raw_path (str): raw data file path; used to derive the day-file names
    pro_data (str): accepted for interface compatibility; not used here
    """

    def __init__(
        self,
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split="train",
        raw_path="",
        pro_data="",
    ):
        # dataset
        # tar_fea = 1 # single target
        den_fea = 13  # 13 dense features
        # spa_fea = 26 # 26 sparse features
        # tad_fea = tar_fea + den_fea
        # tot_fea = tad_fea + spa_fea
        if dataset == "kaggle":
            days = 7
        elif dataset == "terabyte":
            days = 24
        else:
            raise (ValueError("Data set option is not supported"))
        self.max_ind_range = max_ind_range
        # split the datafile into path and filename
        lstr = raw_path.split("/")
        self.d_path = "/".join(lstr[0:-1]) + "/"
        self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
        self.npzfile = self.d_path + (
            (self.d_file + "_day") if dataset == "kaggle" else self.d_file
        )
        self.trafile = self.d_path + (
            (self.d_file + "_fea") if dataset == "kaggle" else "fea"
        )
        # get a number of samples per day
        total_file = self.d_path + self.d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]
        # compute offsets per file: offset_per_file[i] becomes the global index
        # of the first sample of day i (cumulative sum with a leading 0)
        self.offset_per_file = np.array([0] + list(total_per_file))
        for i in range(days):
            self.offset_per_file[i + 1] += self.offset_per_file[i]
        # print(self.offset_per_file)
        # setup data
        self.split = split
        if split == "none" or split == "train":
            self.day = 0
            self.max_day_range = days if split == "none" else days - 1
        elif split == "test" or split == "val":
            # the last day is halved: first half test, second half val
            self.day = days - 1
            num_samples = self.offset_per_file[days] - self.offset_per_file[days - 1]
            self.test_size = int(np.ceil(num_samples / 2.0))
            self.val_size = num_samples - self.test_size
        else:
            sys.exit("ERROR: dataset split is neither none, nor train or test.")
        # load unique counts
        with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
            self.counts = data["counts"]
        self.m_den = den_fea  # X_int.shape[1]
        self.n_emb = len(self.counts)
        print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))
        # Load the test data
        # Only a single day is used for testing
        if self.split == "test" or self.split == "val":
            # only a single day is used for testing
            fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
            with np.load(fi) as data:
                self.X_int = data["X_int"]  # continuous feature
                self.X_cat = data["X_cat"]  # categorical feature
                self.y = data["y"]  # target

    def __getitem__(self, index):
        """Return (X_int, X_cat, y) for the given global sample index.

        NOTE(review): for "none"/"train" splits the next day file is swapped
        in only when ``index`` equals a day offset, and ``self.day_boundary``
        is first assigned there — this assumes samples are requested
        sequentially starting from index 0 (unshuffled loader); confirm the
        DataLoader uses shuffle=False.
        """
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        if self.split == "none" or self.split == "train":
            # check if need to swicth to next day and load data
            if index == self.offset_per_file[self.day]:
                # print("day_boundary switch", index)
                self.day_boundary = self.offset_per_file[self.day]
                fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
                # print('Loading file: ', fi)
                with np.load(fi) as data:
                    self.X_int = data["X_int"]  # continuous feature
                    self.X_cat = data["X_cat"]  # categorical feature
                    self.y = data["y"]  # target
                self.day = (self.day + 1) % self.max_day_range
            i = index - self.day_boundary
        elif self.split == "test" or self.split == "val":
            # only a single day is used for testing
            i = index + (0 if self.split == "test" else self.test_size)
        else:
            sys.exit("ERROR: dataset split is neither none, nor train or test.")
        if self.max_ind_range > 0:
            return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i]
        else:
            return self.X_int[i], self.X_cat[i], self.y[i]

    def _default_preprocess(self, X_int, X_cat, y):
        """Log-transform dense features and convert fields to torch tensors.

        NOTE(review): not called anywhere in the visible code.
        """
        X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1)
        if self.max_ind_range > 0:
            X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long)
        else:
            X_cat = torch.tensor(X_cat, dtype=torch.long)
        y = torch.tensor(y.astype(np.float32))
        return X_int, X_cat, y

    def __len__(self):
        """Number of samples visible under the configured split."""
        if self.split == "none":
            return self.offset_per_file[-1]
        elif self.split == "train":
            return self.offset_per_file[-2]
        elif self.split == "test":
            return self.test_size
        elif self.split == "val":
            return self.val_size
        else:
            sys.exit("ERROR: dataset split is neither none, nor train nor test.")
def collate_wrapper_criteo(list_of_tuples):
    """Collate (X_int, X_cat, y) sample tuples into batched DLRM inputs.

    Returns (X_int, lS_o, lS_i, T): log-transformed dense features, stacked
    per-feature offset tensors (0..batch-1 per feature), stacked per-feature
    index tensors, and the target column vector.
    """
    dense_col, sparse_col, target_col = zip(*list_of_tuples)
    X_int = torch.log(torch.tensor(dense_col, dtype=torch.float) + 1)
    X_cat = torch.tensor(sparse_col, dtype=torch.long)
    T = torch.tensor(target_col, dtype=torch.float32).view(-1, 1)
    batch_size = X_cat.shape[0]
    feature_cnt = X_cat.shape[1]
    # one index column per sparse feature; one lookup per sample -> offsets 0..n-1
    lS_i = torch.stack([X_cat[:, f] for f in range(feature_cnt)])
    lS_o = torch.stack(
        [torch.tensor(range(batch_size)) for _ in range(feature_cnt)]
    )
    return X_int, lS_o, lS_i, T
# Conversion from offset to length
def offset_to_length_convertor(lS_o, lS_i):
    """Convert per-feature offset tensors into per-feature length tensors."""

    def consecutive_diff(t):
        # length k = offset[k+1] - offset[k]
        return t[1:] - t[:-1]

    lengths = []
    for ind, S_o in enumerate(lS_o):
        # append the total index count so the final segment gets a length too
        extended = torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()
        lengths.append(consecutive_diff(extended))
    return torch.stack(lengths)
def unpack_batch(b, data_gen, data_set):
    """Unpack a batch tuple, appending unit sample weights shaped like the target."""
    X, lS_o, lS_i, T = b[0], b[1], b[2], b[3]
    return X, lS_o, lS_i, T, torch.ones(T.size())
def read_dataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    mini_batch_size,
    num_batches,
    randomize,
    split="train",
    raw_data="",
    processed_data="",
    memory_map=False,
    inference_only=False,
    test_mini_batch_size=1,
):
    """Load a Criteo dataset and expose it as mini-batches.

    Returns:
        When ``memory_map`` is True: (train_data, train_loader, test_data,
        test_loader) backed by ``CriteoDatasetWMemoryMap`` objects.
        Otherwise: a 12-tuple (nbatches, lX, lS_lengths, lS_indices, lT,
        nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t, ln_emb, m_den)
        of fully materialized train/test batch lists plus embedding sizes.

    NOTE(review): ``inference_only`` is accepted but never referenced in the
    body of this function.
    """
    # split the datafile into path and filename
    lstr = raw_data.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    file, days = data_utils.loadDataset(
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split,
        raw_data,
        processed_data,
        memory_map,
    )
    if memory_map:
        # WARNING: at this point the data has been reordered and shuffled across files
        # e.g. day_<number>_reordered.npz, what remains is simply to read and feed
        # the data from each file, going in the order of days file-by-file, to the
        # model during training.
        train_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "train",
            raw_data,
            processed_data,
        )
        test_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "test",
            raw_data,
            processed_data,
        )
        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=test_mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        return train_data, train_loader, test_data, test_loader
    else:
        # load and preprocess data
        with np.load(file) as data:
            X_int = data["X_int"]
            X_cat = data["X_cat"]
            y = data["y"]
            counts = data["counts"]
        # get a number of samples per day
        total_file = d_path + d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]
        # transform
        (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        ) = data_utils.transformCriteoAdData(
            X_cat, X_int, y, days, split, randomize, total_per_file
        )
        ln_emb = counts
        m_den = X_int_train.shape[1]
        n_emb = len(counts)
        print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

        # adjust parameters
        def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
            # Materialize the (dense, lengths, indices, target) batch lists for
            # one split; closes over mini_batch_size, num_batches and n_emb.
            if max_ind_range > 0:
                X_cat = X_cat % max_ind_range
            nsamples = len(y)
            data_size = nsamples
            # using floor is equivalent to dropping last mini-batch (drop_last = True)
            nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
            print(print_message)
            if num_batches != 0 and num_batches < nbatches:
                print(
                    "Limiting to %d batches of the total % d batches"
                    % (num_batches, nbatches)
                )
                nbatches = num_batches
            else:
                print("Total number of batches %d" % nbatches)
            # data main loop
            lX = []
            lS_lengths = []
            lS_indices = []
            lT = []
            for j in range(0, nbatches):
                # number of data points in a batch
                print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
                n = min(mini_batch_size, data_size - (j * mini_batch_size))
                # dense feature
                idx_start = j * mini_batch_size
                lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
                # Targets - outputs
                lT.append(
                    (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
                )
                # sparse feature (sparse indices)
                lS_emb_indices = []
                # for each embedding generate a list of n lookups,
                # where each lookup is composed of multiple sparse indices
                for size in range(n_emb):
                    lS_batch_indices = []
                    for _b in range(n):
                        # num of sparse indices to be used per embedding, e.g. for
                        # store lengths and indices
                        lS_batch_indices += (
                            (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32)
                        ).tolist()
                    lS_emb_indices.append(lS_batch_indices)
                lS_indices.append(lS_emb_indices)
                # Criteo Kaggle data it is 1 because data is categorical
                lS_lengths.append(
                    [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
                )
            print("\n")
            return nbatches, lX, lS_lengths, lS_indices, lT

        # adjust training data
        (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples(
            X_cat_train, X_int_train, y_train, max_ind_range, "Training data"
        )
        # adjust testing data
        (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples(
            X_cat_test, X_int_test, y_test, max_ind_range, "Testing data"
        )
    # end if memory_map
    return (
        nbatches,
        lX,
        lS_lengths,
        lS_indices,
        lT,
        nbatches_t,
        lX_t,
        lS_lengths_t,
        lS_indices_t,
        lT_t,
        ln_emb,
        m_den,
    )
def generate_random_data(
    m_den,
    ln_emb,
    data_size,
    num_batches,
    mini_batch_size,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    num_targets=1,
    round_targets=False,
    data_generation="random",
    trace_file="",
    enable_padding=False,
):
    """Generate batches of inputs (random or synthetic) and random click targets.

    Returns (nbatches, lX, lS_lengths, lS_indices, lT).
    """
    nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
    if num_batches != 0:
        # an explicit batch count overrides the data size
        nbatches = num_batches
        data_size = nbatches * mini_batch_size
    # print("Total number of batches %d" % nbatches)
    lT, lX, lS_lengths, lS_indices = [], [], [], []
    for batch in range(nbatches):
        # the last batch may hold fewer than mini_batch_size points
        n = min(mini_batch_size, data_size - (batch * mini_batch_size))
        # generate a batch of dense and sparse features
        if data_generation == "random":
            (Xt, lS_emb_lengths, lS_emb_indices) = generate_uniform_input_batch(
                m_den, ln_emb, n, num_indices_per_lookup, num_indices_per_lookup_fixed
            )
        elif data_generation == "synthetic":
            (Xt, lS_emb_lengths, lS_emb_indices) = generate_synthetic_input_batch(
                m_den,
                ln_emb,
                n,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                trace_file,
                enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + data_generation + " is not supported"
            )
        lX.append(Xt)
        lS_lengths.append(lS_emb_lengths)
        lS_indices.append(lS_emb_indices)
        # generate a batch of target (probability of a click)
        lT.append(generate_random_output_batch(n, num_targets, round_targets))
    return (nbatches, lX, lS_lengths, lS_indices, lT)
def generate_random_output_batch(n, num_targets=1, round_targets=False):
    """Draw an (n, num_targets) batch of click probabilities.

    With round_targets the probabilities are binarized to int32 0/1 labels.
    """
    probs = ra.rand(n, num_targets).astype(np.float32)
    if round_targets:
        return np.round(probs).astype(np.int32)
    return probs
# uniform distribution (input data)
def generate_uniform_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
):
    """Draw one batch of uniform dense features plus per-table sparse lookups.

    Returns (Xt, lS_emb_lengths, lS_emb_indices): an (n, m_den) float32 array
    and, per embedding table, the lookup lengths and flattened indices for
    the n samples.
    """
    # dense features, uniform in [0, 1)
    Xt = ra.rand(n, m_den).astype(np.float32)
    lS_emb_lengths = []
    lS_emb_indices = []
    # one (lengths, indices) pair per embedding table
    for size in ln_emb:
        batch_lengths = []
        batch_indices = []
        for _ in range(n):
            if num_indices_per_lookup_fixed:
                group_size = np.int32(num_indices_per_lookup)
            else:
                # random group size in [1, num_indices_per_lookup]
                draw = ra.random(1)
                group_size = np.int32(
                    max(1, np.round(draw * min(size, num_indices_per_lookup))[0])
                )
            # draw the indices; np.unique collapses duplicates
            draw = ra.random(group_size)
            group = np.unique(np.round(draw * (size - 1)).astype(np.int32))
            # group size may have shrunk after duplicate removal
            batch_lengths.append(np.int32(group.size))
            batch_indices.extend(group.tolist())
        lS_emb_lengths.append(batch_lengths)
        lS_emb_indices.append(batch_indices)
    return (Xt, lS_emb_lengths, lS_emb_indices)
# synthetic distribution (input data)
def generate_synthetic_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    trace_file,
    enable_padding=False,
):
    """Generate one batch of dense features and trace-driven sparse lookups.

    Sparse indices are produced by replaying a synthetic memory trace built
    from a per-table distribution file (``trace_file`` with "j" replaced by
    the table index).

    NOTE(review): the distribution file is re-read for every sample of every
    table (innermost loop). Since trace_generate_lru mutates line_accesses,
    re-reading resets that state per sample — confirm this is intended, as it
    costs one file read per (table, sample) pair.
    """
    # dense feature
    Xt = ra.rand(n, m_den).astype(np.float32)
    # sparse feature (sparse indices)
    lS_emb_lengths = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for i, size in enumerate(ln_emb):
        lS_batch_lengths = []
        lS_batch_indices = []
        for _ in range(n):
            # num of sparse indices to be used per embedding (between
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int32(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int32(
                    max(1, np.round(r * min(size, num_indices_per_lookup))[0])
                )
            # sparse indices to be used per embedding
            file_path = trace_file
            line_accesses, list_sd, cumm_sd = read_dist_from_file(
                file_path.replace("j", str(i))
            )
            # debug print
            # print('input')
            # print(line_accesses); print(list_sd); print(cumm_sd);
            # print(sparse_group_size)
            # approach 1: rand
            # r = trace_generate_rand(
            #     line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            # )
            # approach 2: lru
            r = trace_generate_lru(
                line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            )
            # WARNING: if the distribution in the file is not consistent with
            # embedding table dimensions, below mod guards against out of
            # range access
            sparse_group = np.unique(r).astype(np.int32)
            minsg = np.min(sparse_group)
            maxsg = np.max(sparse_group)
            if (minsg < 0) or (size <= maxsg):
                print(
                    "WARNING: distribution is inconsistent with embedding "
                    + "table size (using mod to recover and continue)"
                )
                sparse_group = np.mod(sparse_group, size).astype(np.int32)
            # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int32))
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int32(sparse_group.size)
            # store lengths and indices
            lS_batch_lengths += [sparse_group_size]
            lS_batch_indices += sparse_group.tolist()
        lS_emb_lengths.append(lS_batch_lengths)
        lS_emb_indices.append(lS_batch_indices)
    return (Xt, lS_emb_lengths, lS_emb_indices)
def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
    """Sample a stack distance from a cumulative distribution.

    Args:
        cumm_val: sorted support values of the distribution.
        cumm_dist: cumulative probabilities aligned with cumm_val.
        max_i: total number of unique references available.
        i: number of new references seen so far.
        enable_padding: once all references have been seen, remap the
            distribution support so that no new (sd == 0) references
            are generated.

    Returns:
        One element of cumm_val.
    """
    u = ra.rand(1)
    if i < max_i:
        # only generate stack distances up to the number of new references seen so far
        j = bisect.bisect(cumm_val, i) - 1
        fi = cumm_dist[j]
        u *= fi  # shrink distribution support to exclude last values
    elif enable_padding:
        # WARNING: disable generation of new references (once all have been seen)
        fi = cumm_dist[0]
        u = (1.0 - fi) * u + fi  # remap distribution support to exclude first value
    for j, f in enumerate(cumm_dist):
        if u <= f:
            return cumm_val[j]
    # Fallback: floating-point rounding can leave cumm_dist[-1] slightly below
    # u, in which case the loop above finds no bucket; previously execution
    # fell off the end and implicitly returned None, crashing the caller.
    return cumm_val[-1]
# WARNING: global define, must be consistent across all synthetic functions
cache_line_size = 1  # elements per cache line when mapping line refs to memory refs
def trace_generate_lru(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Generate a synthetic memory trace of out_trace_len references.

    Maintains ``line_accesses`` as an LRU stack (mutated in place): a sampled
    stack distance of 0 introduces the least-recently-used line as a "new"
    reference; otherwise the line at that stack distance is reused and moved
    to the most-recently-used end.
    """
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)
    new_refs_seen = 0
    ztrace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, new_refs_seen, enable_padding
        )
        mem_ref_within_line = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:  # new reference #
            # rotate the LRU line to the MRU end
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            new_refs_seen += 1
        else:  # existing reference #
            # reuse the line at stack distance sd and move it to the MRU end
            line_ref = line_accesses.pop(num_lines - sd)
            line_accesses.append(line_ref)
        # save generated memory reference
        ztrace.append(np.uint64(line_ref * cache_line_size + mem_ref_within_line))
    return ztrace
def trace_generate_rand(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Generate a synthetic memory trace of out_trace_len references.

    Unlike the LRU variant, a reused line keeps its position in
    ``line_accesses``; only new (sd == 0) references rotate the list.
    """
    max_sd = list_sd[-1]
    num_unique = len(line_accesses)  # !!!Unique,
    new_refs_seen = 0
    ztrace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, new_refs_seen, enable_padding
        )
        mem_ref_within_line = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:  # new reference #
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            new_refs_seen += 1
        else:  # existing reference #
            line_ref = line_accesses[num_unique - sd]
        ztrace.append(np.uint64(line_ref * cache_line_size + mem_ref_within_line))
    return ztrace
def trace_profile(trace, enable_padding=False):
    """Profile a memory trace into its LRU stack-distance histogram inputs.

    Based on the stack-distance profiling of the synthetic-trace paper cited
    in the file header.

    Returns:
        (rstack, stack_distances, line_accesses): the final LRU stack, one
        stack distance per reference (0 for first-time accesses), and the
        first-touch order of unique lines. The two lists are built with
        insert(0, ...), i.e. in reverse trace order; callers reverse them.
    """
    # number of elements in the array (assuming 1D)
    # n = trace.size
    rstack = []  # S
    stack_distances = []  # SDS
    line_accesses = []  # L
    for x in trace:
        r = np.uint64(x / cache_line_size)
        l = len(rstack)
        try:  # found #
            i = rstack.index(r)
            # WARNING: I believe below is the correct depth in terms of meaning of the
            # algorithm, but that is not what seems to be in the paper alg.
            # -1 can be subtracted if we defined the distance between
            # consecutive accesses (e.g. r, r) as 0 rather than 1.
            sd = l - i  # - 1
            # push r to the end of stack_distances
            stack_distances.insert(0, sd)
            # remove r from its position and insert to the top of stack
            rstack.pop(i)  # rstack.remove(r)
            rstack.insert(l - 1, r)
        except ValueError:  # not found #
            sd = 0  # -1
            # push r to the end of stack_distances/line_accesses
            stack_distances.insert(0, sd)
            line_accesses.insert(0, r)
            # push r to the top of stack
            rstack.insert(l, r)
    if enable_padding:
        # WARNING: notice that as the ratio between the number of samples (l)
        # and cardinality [c] of a sample increases the probability of
        # generating a sample gets smaller and smaller because there are
        # few new samples compared to repeated samples. This means that for a
        # long trace with relatively small cardinality it will take longer to
        # generate all new samples and therefore obtain full distribution support
        # and hence it takes longer for distribution to resemble the original.
        # Therefore, we may pad the number of new samples to be on par with
        # average number of samples l/c artificially.
        # NOTE(review): if the trace has no reuse, max(stack_distances) is 0
        # and the division below raises — confirm padding is only enabled for
        # traces with repeated accesses.
        l = len(stack_distances)
        c = max(stack_distances)
        padding = int(np.ceil(l / c))
        stack_distances = stack_distances + [0] * padding
    return (rstack, stack_distances, line_accesses)
# auxiliary read/write routines
def read_trace_from_file(file_path, binary_type=None):
    """Read a memory trace (list of np.uint64) from a file.

    Args:
        file_path: path of the trace file.
        binary_type: True to read a raw binary uint64 dump, False to parse a
            single comma-separated text line. Defaults to None, which falls
            back to the script-level ``args.trace_file_binary_type`` flag so
            existing single-argument callers keep working.

    Returns:
        The trace as a list of np.uint64, or None when the file cannot be
        read (the error is printed, matching the original best-effort style).
    """
    try:
        if binary_type is None:
            # legacy behavior: consult the CLI flag; if this module was
            # imported as a library (no `args`), the broad except below
            # swallows the NameError exactly as the original code did
            binary_type = args.trace_file_binary_type
        with open(file_path) as f:
            if binary_type:
                array = np.fromfile(f, dtype=np.uint64)
                trace = array.astype(np.uint64).tolist()
            else:
                line = f.readline()
                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
            return trace
    except Exception:
        print("ERROR: no input trace file has been provided")
def write_trace_to_file(file_path, trace, binary_type=None):
    """Write a memory trace to a file.

    Args:
        file_path: destination path.
        trace: sequence of integer memory references.
        binary_type: True to write a raw binary uint64 dump, False to write
            one comma-separated text line. Defaults to None, which falls back
            to the script-level ``args.trace_file_binary_type`` flag so
            existing two-argument callers keep working.

    Errors are printed rather than raised (original best-effort style).
    """
    try:
        if binary_type is None:
            # legacy behavior: consult the CLI flag; a missing `args` (library
            # use) is swallowed by the broad except, as in the original code
            binary_type = args.trace_file_binary_type
        if binary_type:
            with open(file_path, "wb+") as f:
                np.array(trace).astype(np.uint64).tofile(f)
        else:
            with open(file_path, "w+") as f:
                # str([a, b, c]) -> "a, b, c" once the brackets are stripped
                s = str(trace)
                f.write(s[1 : len(s) - 1])
    except Exception:
        print("ERROR: no output trace file has been provided")
def read_dist_from_file(file_path):
    """Read a (unique_accesses, list_sd, cumm_sd) distribution from a file.

    The file layout is three comma-separated lines: unique line accesses
    (int), stack-distance support values (int), and cumulative probabilities
    (float) — the format produced by write_dist_to_file.

    Raises:
        The underlying I/O error when the file cannot be read.
    """
    try:
        with open(file_path, "r") as f:
            lines = f.read().splitlines()
    except Exception:
        print("Wrong file or file path")
        # Previously execution fell through and crashed on the undefined
        # `lines` variable; re-raise so the caller sees the real I/O error.
        raise
    # read unique accesses
    unique_accesses = [int(el) for el in lines[0].split(", ")]
    # read cumulative distribution (elements are passed as two separate lists)
    list_sd = [int(el) for el in lines[1].split(", ")]
    cumm_sd = [float(el) for el in lines[2].split(", ")]
    return unique_accesses, list_sd, cumm_sd
def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
    """Write the three distribution lists to a file, one CSV line each.

    Line order: unique accesses, stack-distance values, cumulative
    probabilities (the format read back by read_dist_from_file). Errors are
    printed rather than raised.
    """
    try:
        with open(file_path, "w") as f:
            for seq in (unique_accesses, list_sd, cumm_sd):
                # str([a, b]) -> "a, b" once the surrounding brackets are stripped
                s = str(seq)
                f.write(s[1 : len(s) - 1] + "\n")
    except Exception:
        print("Wrong file or file path")
if __name__ == "__main__":
    import argparse
    import operator
    import sys

    ### parse arguments ###
    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
    parser.add_argument("--trace-enable-padding", type=bool, default=False)
    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
    parser.add_argument(
        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
    )
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--print-precision", type=int, default=5)
    args = parser.parse_args()
    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    ### read trace ###
    trace = read_trace_from_file(args.trace_file)
    # print(trace)
    ### profile trace ###
    (_, stack_distances, line_accesses) = trace_profile(
        trace, args.trace_enable_padding
    )
    # trace_profile builds its lists in reverse trace order; restore order
    stack_distances.reverse()
    line_accesses.reverse()
    # print(line_accesses)
    # print(stack_distances)
    ### compute probability distribution ###
    # count items
    l = len(stack_distances)
    dc = sorted(
        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
    )
    # create a distribution
    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
    dist_sd = list(
        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
    )  # k = tuple_x_k[1]
    cumm_sd = []  # np.cumsum(dc).tolist() #prefixsum
    for i, (_, k) in enumerate(dc):
        if i == 0:
            cumm_sd.append(k / float(l))
        else:
            # add the 2nd element of the i-th tuple in the dist_sd list
            cumm_sd.append(cumm_sd[i - 1] + (k / float(l)))
    ### write stack_distance and line_accesses to a file ###
    write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd)
    ### generate corresponding synthetic trace ###
    # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file)
    synthetic_trace = trace_generate_lru(
        line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    )
    # synthetic_trace = trace_generate_rand(
    #     line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    # )
    write_trace_to_file(args.synthetic_file, synthetic_trace)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: generate inputs and targets for the dlrm benchmark
# The inputs and outputs are generated according to the following three option(s)
# 1) random distribution
# 2) synthetic distribution, based on unique accesses and distances between them
# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven
# Simulation of Cache Memory", IEEE AINAM'07
# 3) public data set
# i) Criteo Kaggle Display Advertising Challenge Dataset
# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
# ii) Criteo Terabyte Dataset
# https://labs.criteo.com/2013/12/download-terabyte-click-logs
from __future__ import absolute_import, division, print_function, unicode_literals
import bisect
import collections
import sys
from collections import deque
# others
from os import path
import data_loader_terabyte
import data_utils
import mlperf_logger
# numpy
import numpy as np
# pytorch
import torch
from numpy import random as ra
from torch.utils.data import Dataset, RandomSampler
# Kaggle Display Advertising Challenge Dataset
# dataset (str): name of dataset (Kaggle or Terabyte)
# randomize (str): determines randomization scheme
# "none": no randomization
# "day": randomizes each day"s data (only works if split = True)
# "total": randomizes total dataset
# split (bool) : to split into train, test, validation data-sets
class CriteoDataset(Dataset):
def __init__(
self,
dataset,
max_ind_range,
sub_sample_rate,
randomize,
split="train",
raw_path="",
pro_data="",
memory_map=False,
dataset_multiprocessing=False,
):
# dataset
# tar_fea = 1 # single target
den_fea = 13 # 13 dense features
# spa_fea = 26 # 26 sparse features
# tad_fea = tar_fea + den_fea
# tot_fea = tad_fea + spa_fea
if dataset == "kaggle":
days = 7
out_file = "kaggleAdDisplayChallenge_processed"
elif dataset == "terabyte":
days = 24
out_file = "terabyte_processed"
else:
raise (ValueError("Data set option is not supported"))
self.max_ind_range = max_ind_range
self.memory_map = memory_map
# split the datafile into path and filename
lstr = raw_path.split("/")
self.d_path = "/".join(lstr[0:-1]) + "/"
self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
self.npzfile = self.d_path + (
(self.d_file + "_day") if dataset == "kaggle" else self.d_file
)
self.trafile = self.d_path + (
(self.d_file + "_fea") if dataset == "kaggle" else "fea"
)
# check if pre-processed data is available
data_ready = True
if memory_map:
for i in range(days):
reo_data = self.npzfile + "_{0}_reordered.npz".format(i)
if not path.exists(str(reo_data)):
data_ready = False
else:
if not path.exists(str(pro_data)):
data_ready = False
# pre-process data if needed
# WARNNING: when memory mapping is used we get a collection of files
if data_ready:
print("Reading pre-processed data=%s" % (str(pro_data)))
file = str(pro_data)
else:
print("Reading raw data=%s" % (str(raw_path)))
file = data_utils.getCriteoAdData(
raw_path,
out_file,
max_ind_range,
sub_sample_rate,
days,
split,
randomize,
dataset == "kaggle",
memory_map,
dataset_multiprocessing,
)
# get a number of samples per day
total_file = self.d_path + self.d_file + "_day_count.npz"
with np.load(total_file) as data:
total_per_file = data["total_per_file"]
# compute offsets per file
self.offset_per_file = np.array([0] + [x for x in total_per_file])
for i in range(days):
self.offset_per_file[i + 1] += self.offset_per_file[i]
# print(self.offset_per_file)
# setup data
if memory_map:
# setup the training/testing split
self.split = split
if split == "none" or split == "train":
self.day = 0
self.max_day_range = days if split == "none" else days - 1
elif split == "test" or split == "val":
self.day = days - 1
num_samples = (
self.offset_per_file[days] - self.offset_per_file[days - 1]
)
self.test_size = int(np.ceil(num_samples / 2.0))
self.val_size = num_samples - self.test_size
else:
sys.exit("ERROR: dataset split is neither none, nor train or test.")
"""
# text
print("text")
for i in range(days):
fi = self.npzfile + "_{0}".format(i)
with open(fi) as data:
ttt = 0; nnn = 0
for _j, line in enumerate(data):
ttt +=1
if np.int32(line[0]) > 0:
nnn +=1
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
# processed
print("processed")
for i in range(days):
fi = self.npzfile + "_{0}_processed.npz".format(i)
with np.load(fi) as data:
yyy = data["y"]
ttt = len(yyy)
nnn = np.count_nonzero(yyy)
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
# reordered
print("reordered")
for i in range(days):
fi = self.npzfile + "_{0}_reordered.npz".format(i)
with np.load(fi) as data:
yyy = data["y"]
ttt = len(yyy)
nnn = np.count_nonzero(yyy)
print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
+ str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
"""
# load unique counts
with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
self.counts = data["counts"]
self.m_den = den_fea # X_int.shape[1]
self.n_emb = len(self.counts)
print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))
# Load the test data
# Only a single day is used for testing
if self.split == "test" or self.split == "val":
# only a single day is used for testing
fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
with np.load(fi) as data:
self.X_int = data["X_int"] # continuous feature
self.X_cat = data["X_cat"] # categorical feature
self.y = data["y"] # target
else:
# load and preprocess data
with np.load(file) as data:
X_int = data["X_int"] # continuous feature
X_cat = data["X_cat"] # categorical feature
y = data["y"] # target
self.counts = data["counts"]
self.m_den = X_int.shape[1] # den_fea
self.n_emb = len(self.counts)
print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den))
# create reordering
indices = np.arange(len(y))
if split == "none":
# randomize all data
if randomize == "total":
indices = np.random.permutation(indices)
print("Randomized indices...")
X_int[indices] = X_int
X_cat[indices] = X_cat
y[indices] = y
else:
indices = np.array_split(indices, self.offset_per_file[1:-1])
# randomize train data (per day)
if randomize == "day": # or randomize == "total":
for i in range(len(indices) - 1):
indices[i] = np.random.permutation(indices[i])
print("Randomized indices per day ...")
train_indices = np.concatenate(indices[:-1])
test_indices = indices[-1]
test_indices, val_indices = np.array_split(test_indices, 2)
print("Defined %s indices..." % (split))
# randomize train data (across days)
if randomize == "total":
train_indices = np.random.permutation(train_indices)
print("Randomized indices across days ...")
# create training, validation, and test sets
if split == "train":
self.X_int = [X_int[i] for i in train_indices]
self.X_cat = [X_cat[i] for i in train_indices]
self.y = [y[i] for i in train_indices]
elif split == "val":
self.X_int = [X_int[i] for i in val_indices]
self.X_cat = [X_cat[i] for i in val_indices]
self.y = [y[i] for i in val_indices]
elif split == "test":
self.X_int = [X_int[i] for i in test_indices]
self.X_cat = [X_cat[i] for i in test_indices]
self.y = [y[i] for i in test_indices]
print("Split data according to indices...")
    def __getitem__(self, index):
        """Return one sample (X_int, X_cat, y) at `index`.

        Slices are supported by recursing per element. In memory_map mode
        the per-day .npz shards are loaded lazily, and `index` is translated
        into an offset within the currently loaded day/split.
        """
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        if self.memory_map:
            if self.split == "none" or self.split == "train":
                # check if we need to switch to the next day and load its data
                if index == self.offset_per_file[self.day]:
                    # print("day_boundary switch", index)
                    self.day_boundary = self.offset_per_file[self.day]
                    fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
                    # print('Loading file: ', fi)
                    with np.load(fi) as data:
                        self.X_int = data["X_int"]  # continuous feature
                        self.X_cat = data["X_cat"]  # categorical feature
                        self.y = data["y"]  # target
                    self.day = (self.day + 1) % self.max_day_range
                # index relative to the start of the loaded day
                i = index - self.day_boundary
            elif self.split == "test" or self.split == "val":
                # only a single day is used for testing; the loaded day holds
                # the test samples first, so "val" indices are shifted by
                # test_size (consistent with __len__ using val_size/test_size)
                i = index + (0 if self.split == "test" else self.test_size)
            else:
                sys.exit("ERROR: dataset split is neither none, nor train or test.")
        else:
            i = index
        if self.max_ind_range > 0:
            # fold categorical ids into [0, max_ind_range)
            return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i]
        else:
            return self.X_int[i], self.X_cat[i], self.y[i]
def _default_preprocess(self, X_int, X_cat, y):
X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1)
if self.max_ind_range > 0:
X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long)
else:
X_cat = torch.tensor(X_cat, dtype=torch.long)
y = torch.tensor(y.astype(np.float32))
return X_int, X_cat, y
def __len__(self):
if self.memory_map:
if self.split == "none":
return self.offset_per_file[-1]
elif self.split == "train":
return self.offset_per_file[-2]
elif self.split == "test":
return self.test_size
elif self.split == "val":
return self.val_size
else:
sys.exit("ERROR: dataset split is neither none, nor train nor test.")
else:
return len(self.y)
def collate_wrapper_criteo_offset(list_of_tuples):
    """Collate (X_int, X_cat, y) samples into one DLRM batch (offset format).

    Returns (X, lS_o, lS_i, T): log-transformed dense features, per-feature
    offset tensors (one offset per sample), per-feature index tensors, and
    targets as a column vector.
    """
    dense_raw, sparse_raw, target_raw = zip(*list_of_tuples)
    dense = torch.log(torch.tensor(dense_raw, dtype=torch.float) + 1)
    sparse = torch.tensor(sparse_raw, dtype=torch.long)
    targets = torch.tensor(target_raw, dtype=torch.float32).view(-1, 1)
    num_rows, num_features = sparse.shape
    # one index column per sparse feature; one lookup per row, so offsets
    # are simply 0..num_rows-1 for every feature
    per_feature_indices = torch.stack([sparse[:, f] for f in range(num_features)])
    per_feature_offsets = torch.stack(
        [torch.tensor(range(num_rows)) for _ in range(num_features)]
    )
    return dense, per_feature_offsets, per_feature_indices, targets
def ensure_dataset_preprocessed(args, d_path):
    """Materialize the binary (.bin) Criteo Terabyte dataset files.

    First constructs CriteoDataset for "train" and "test" — construction is
    what triggers (and caches) the raw-data preprocessing into the reordered
    per-day .npz files — then converts those .npz files to one .bin file per
    split via data_loader_terabyte.numpy_to_binary.

    Improvements over the original: the two duplicated CriteoDataset calls are
    folded into a loop, and the loop-invariant input-file lists are computed
    once instead of on every split iteration.
    """
    for split in ("train", "test"):
        _ = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            split,
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )
    # days 0..22 form the training set; day 23 is shared by val and test
    train_files = [
        "{}_{}_reordered.npz".format(args.raw_data_file, day)
        for day in range(0, 23)
    ]
    test_valid_file = args.raw_data_file + "_23_reordered.npz"
    for split in ["train", "val", "test"]:
        print("Running preprocessing for split =", split)
        output_file = d_path + "_{}.bin".format(split)
        input_files = train_files if split == "train" else [test_valid_file]
        data_loader_terabyte.numpy_to_binary(
            input_files=input_files, output_file_path=output_file, split=split
        )
# Conversion from offset to length
def offset_to_length_converter(lS_o, lS_i):
    """Convert per-table offset tensors into per-lookup length tensors.

    For each table, the lookup lengths are the successive differences of
    the offsets, closed off by the total number of indices in that table.
    """
    lengths = []
    for table_idx, offsets in enumerate(lS_o):
        total = torch.tensor(lS_i[table_idx].shape)
        boundaries = torch.cat((offsets, total)).int()
        lengths.append(boundaries[1:] - boundaries[:-1])
    return torch.stack(lengths)
def collate_wrapper_criteo_length(list_of_tuples):
    """Collate (X_int, X_cat, y) samples into one DLRM batch (length format).

    Same as collate_wrapper_criteo_offset, except the second element of the
    returned tuple holds per-lookup lengths instead of offsets.
    """
    dense_raw, sparse_raw, target_raw = zip(*list_of_tuples)
    X_int = torch.log(torch.tensor(dense_raw, dtype=torch.float) + 1)
    X_cat = torch.tensor(sparse_raw, dtype=torch.long)
    T = torch.tensor(target_raw, dtype=torch.float32).view(-1, 1)
    num_rows, num_features = X_cat.shape
    lS_i = torch.stack([X_cat[:, f] for f in range(num_features)])
    lS_o = torch.stack(
        [torch.tensor(range(num_rows)) for _ in range(num_features)]
    )
    lS_l = offset_to_length_converter(lS_o, lS_i)
    return X_int, lS_l, lS_i, T
def make_criteo_data_and_loaders(args, offset_to_length_converter=False):
    """Build Criteo train/test datasets and DataLoaders.

    Three paths, selected from args:
      1. mlperf_logging + memory_map + terabyte + mlperf_bin_loader:
         binary-file datasets (running preprocessing first if the .bin
         files are missing),
      2. mlperf_logging + memory_map + terabyte: day-sharded numpy loaders,
      3. otherwise: in-memory CriteoDataset with a standard DataLoader.

    Returns (train_data, train_loader, test_data, test_loader).

    Improvement over the original: the CriteoDataset construction, which was
    duplicated verbatim for both splits in two branches, is extracted into a
    local helper.
    """

    def _criteo_split(split):
        # build the CriteoDataset for one split (all other args identical)
        return CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            split,
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

    if args.mlperf_logging and args.memory_map and args.data_set == "terabyte":
        # more efficient for larger batches
        data_directory = path.dirname(args.raw_data_file)

        if args.mlperf_bin_loader:
            lstr = args.processed_data_file.split("/")
            d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0]
            train_file = d_path + "_train.bin"
            test_file = d_path + "_test.bin"
            # val_file = d_path + "_val.bin"
            counts_file = args.raw_data_file + "_fea_count.npz"
            if any(not path.exists(p) for p in [train_file, test_file, counts_file]):
                ensure_dataset_preprocessed(args, d_path)

            train_data = data_loader_terabyte.CriteoBinDataset(
                data_file=train_file,
                counts_file=counts_file,
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
            )
            mlperf_logger.log_event(
                key=mlperf_logger.constants.TRAIN_SAMPLES, value=train_data.num_samples
            )
            # batch_size=None: the dataset already yields whole batches
            train_loader = torch.utils.data.DataLoader(
                train_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
                sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None,
            )

            test_data = data_loader_terabyte.CriteoBinDataset(
                data_file=test_file,
                counts_file=counts_file,
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
            )
            mlperf_logger.log_event(
                key=mlperf_logger.constants.EVAL_SAMPLES, value=test_data.num_samples
            )
            test_loader = torch.utils.data.DataLoader(
                test_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
            )
        else:
            data_filename = args.raw_data_file.split("/")[-1]
            train_data = _criteo_split("train")
            test_data = _criteo_split("test")
            # terabyte layout: days 0..22 train, day 23 test
            train_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=list(range(23)),
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="train",
            )
            test_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=[23],
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="test",
            )
    else:
        train_data = _criteo_split("train")
        test_data = _criteo_split("test")

        collate_wrapper_criteo = collate_wrapper_criteo_offset
        if offset_to_length_converter:
            collate_wrapper_criteo = collate_wrapper_criteo_length

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.mini_batch_size,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )
        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=args.test_mini_batch_size,
            shuffle=False,
            num_workers=args.test_num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

    return train_data, train_loader, test_data, test_loader
# uniform ditribution (input data)
class RandomDataset(Dataset):
    """Synthetic dataset that fabricates one whole DLRM mini-batch per index.

    Each __getitem__ returns (X, lS_o, lS_i, T): dense features, per-table
    lookup offsets, per-table lookup indices, and click targets, generated
    either from a random distribution (generate_dist_input_batch) or from a
    synthetic trace file (generate_synthetic_input_batch).
    """
    def __init__(
        self,
        m_den,
        ln_emb,
        data_size,
        num_batches,
        mini_batch_size,
        num_indices_per_lookup,
        num_indices_per_lookup_fixed,
        num_targets=1,
        round_targets=False,
        data_generation="random",
        trace_file="",
        enable_padding=False,
        reset_seed_on_access=False,
        rand_data_dist="uniform",
        rand_data_min=1,
        rand_data_max=1,
        rand_data_mu=-1,
        rand_data_sigma=1,
        rand_seed=0,
    ):
        """Record generation parameters; no data is materialized up front."""
        # compute batch size
        nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
        if num_batches != 0:
            # explicit num_batches overrides data_size
            nbatches = num_batches
            data_size = nbatches * mini_batch_size
        # print("Total number of batches %d" % nbatches)
        # save args (recompute data_size if needed)
        self.m_den = m_den
        self.ln_emb = ln_emb
        self.data_size = data_size
        self.num_batches = nbatches
        self.mini_batch_size = mini_batch_size
        self.num_indices_per_lookup = num_indices_per_lookup
        self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed
        self.num_targets = num_targets
        self.round_targets = round_targets
        self.data_generation = data_generation
        self.trace_file = trace_file
        self.enable_padding = enable_padding
        self.reset_seed_on_access = reset_seed_on_access
        self.rand_seed = rand_seed
        self.rand_data_dist = rand_data_dist
        self.rand_data_min = rand_data_min
        self.rand_data_max = rand_data_max
        self.rand_data_mu = rand_data_mu
        self.rand_data_sigma = rand_data_sigma
    def reset_numpy_seed(self, numpy_rand_seed):
        """Reseed numpy's global RNG so sample streams can be replayed."""
        np.random.seed(numpy_rand_seed)
        # torch.manual_seed(numpy_rand_seed)
    def __getitem__(self, index):
        """Generate and return the index-th batch (X, lS_o, lS_i, T)."""
        if isinstance(index, slice):
            return [
                self[idx]
                for idx in range(
                    index.start or 0, index.stop or len(self), index.step or 1
                )
            ]
        # WARNING: reset seed on access to first element
        # (e.g. if same random samples needed across epochs)
        if self.reset_seed_on_access and index == 0:
            self.reset_numpy_seed(self.rand_seed)
        # number of data points in a batch (last batch may be short)
        n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size))
        # generate a batch of dense and sparse features
        if self.data_generation == "random":
            (X, lS_o, lS_i) = generate_dist_input_batch(
                self.m_den,
                self.ln_emb,
                n,
                self.num_indices_per_lookup,
                self.num_indices_per_lookup_fixed,
                rand_data_dist=self.rand_data_dist,
                rand_data_min=self.rand_data_min,
                rand_data_max=self.rand_data_max,
                rand_data_mu=self.rand_data_mu,
                rand_data_sigma=self.rand_data_sigma,
            )
        elif self.data_generation == "synthetic":
            (X, lS_o, lS_i) = generate_synthetic_input_batch(
                self.m_den,
                self.ln_emb,
                n,
                self.num_indices_per_lookup,
                self.num_indices_per_lookup_fixed,
                self.trace_file,
                self.enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + self.data_generation + " is not supported"
            )
        # generate a batch of target (probability of a click)
        T = generate_random_output_batch(n, self.num_targets, self.round_targets)
        return (X, lS_o, lS_i, T)
    def __len__(self):
        # WARNING: note that we produce batches of outputs in __getitem__
        # therefore we should use num_batches rather than data_size below
        return self.num_batches
def collate_wrapper_random_offset(list_of_tuples):
    """Unwrap the single pre-built batch (X, lS_o, lS_i, T) produced by
    RandomDataset and stack its per-table offset tensors."""
    dense, offsets, indices, targets = list_of_tuples[0]
    return (dense, torch.stack(offsets), indices, targets)
def collate_wrapper_random_length(list_of_tuples):
    """Unwrap the single pre-built batch (X, lS_o, lS_i, T), converting the
    stacked per-table offsets into per-lookup lengths."""
    dense, offsets, indices, targets = list_of_tuples[0]
    lengths = offset_to_length_converter(torch.stack(offsets), indices)
    return (dense, lengths, indices, targets)
def make_random_data_and_loader(
    args,
    ln_emb,
    m_den,
    offset_to_length_converter=False,
):
    """Build identical random train/test RandomDatasets plus DataLoaders.

    Returns (train_data, train_loader, test_data, test_loader). batch_size
    is 1 because RandomDataset already yields whole mini-batches.
    """

    def _build_dataset():
        # WARNING: generates a batch of lookups at once
        return RandomDataset(
            m_den,
            ln_emb,
            args.data_size,
            args.num_batches,
            args.mini_batch_size,
            args.num_indices_per_lookup,
            args.num_indices_per_lookup_fixed,
            1,  # num_targets
            args.round_targets,
            args.data_generation,
            args.data_trace_file,
            args.data_trace_enable_padding,
            reset_seed_on_access=True,
            rand_data_dist=args.rand_data_dist,
            rand_data_min=args.rand_data_min,
            rand_data_max=args.rand_data_max,
            rand_data_mu=args.rand_data_mu,
            rand_data_sigma=args.rand_data_sigma,
            rand_seed=args.numpy_rand_seed,
        )

    train_data = _build_dataset()
    test_data = _build_dataset()

    collate_wrapper_random = (
        collate_wrapper_random_length
        if offset_to_length_converter
        else collate_wrapper_random_offset
    )

    def _build_loader(dataset):
        # one dataset item == one mini-batch, hence batch_size=1
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_random,
            pin_memory=False,
            drop_last=False,  # True
        )

    train_loader = _build_loader(train_data)
    test_loader = _build_loader(test_data)
    return train_data, train_loader, test_data, test_loader
def generate_random_data(
    m_den,
    ln_emb,
    data_size,
    num_batches,
    mini_batch_size,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    num_targets=1,
    round_targets=False,
    data_generation="random",
    trace_file="",
    enable_padding=False,
    length=False,  # length for caffe2 version (except dlrm_s_caffe2)
):
    """Eagerly pre-generate all input and target batches.

    Returns (nbatches, lX, lS_offsets, lS_indices, lT) — one entry per batch
    in each list.
    """
    nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
    if num_batches != 0:
        # explicit num_batches overrides data_size
        nbatches = num_batches
        data_size = nbatches * mini_batch_size
    # print("Total number of batches %d" % nbatches)
    lT = []
    lX = []
    lS_offsets = []
    lS_indices = []
    for batch_idx in range(0, nbatches):
        # number of data points in this batch (last one may be short)
        samples_in_batch = min(mini_batch_size, data_size - (batch_idx * mini_batch_size))
        # generate a batch of dense and sparse features
        if data_generation == "random":
            (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch(
                m_den,
                ln_emb,
                samples_in_batch,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                length,
            )
        elif data_generation == "synthetic":
            (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch(
                m_den,
                ln_emb,
                samples_in_batch,
                num_indices_per_lookup,
                num_indices_per_lookup_fixed,
                trace_file,
                enable_padding,
            )
        else:
            sys.exit(
                "ERROR: --data-generation=" + data_generation + " is not supported"
            )
        # dense feature
        lX.append(Xt)
        # sparse feature (sparse indices)
        lS_offsets.append(lS_emb_offsets)
        lS_indices.append(lS_emb_indices)
        # matching batch of targets (probability of a click)
        lT.append(
            generate_random_output_batch(samples_in_batch, num_targets, round_targets)
        )
    return (nbatches, lX, lS_offsets, lS_indices, lT)
def generate_random_output_batch(n, num_targets, round_targets=False):
    """Draw an (n, num_targets) tensor of click probabilities in [0, 1);
    optionally round them into hard 0/1 labels."""
    clicks = ra.rand(n, num_targets).astype(np.float32)
    if round_targets:
        clicks = np.round(clicks).astype(np.float32)
    return torch.tensor(clicks)
# uniform ditribution (input data)
def generate_uniform_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    length,
):
    """Generate one batch of uniform-random dense and sparse features.

    For each table in ln_emb, n lookups are generated; each lookup draws a
    group of uniform indices in [0, size), deduplicated via np.unique.
    With length=True (caffe2 variant) the per-lookup group sizes are stored
    instead of running offsets.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).
    """
    # dense feature
    dense_x = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    emb_offsets = []
    emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for table_size in ln_emb:
        batch_offsets = []
        batch_indices = []
        running_offset = 0
        for _ in range(n):
            # number of sparse indices for this lookup
            if num_indices_per_lookup_fixed:
                group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                draw = ra.random(1)
                group_size = np.int64(
                    np.round(max([1.0], draw * min(table_size, num_indices_per_lookup)))
                )
            # sparse indices to be used per embedding
            draw = ra.random(group_size)
            group = np.unique(np.round(draw * (table_size - 1)).astype(np.int64))
            # dedup may have shrunk the group
            group_size = np.int32(group.size)
            if length:  # caffe2 variant stores lengths instead of offsets
                batch_offsets += [group_size]
            else:
                batch_offsets += [running_offset]
            batch_indices += group.tolist()
            running_offset += group_size
        emb_offsets.append(torch.tensor(batch_offsets))
        emb_indices.append(torch.tensor(batch_indices))
    return (dense_x, emb_offsets, emb_indices)
# random data from uniform or gaussian ditribution (input data)
def generate_dist_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    rand_data_dist,
    rand_data_min,
    rand_data_max,
    rand_data_mu,
    rand_data_sigma,
):
    """Generate one batch of dense features and sparse lookups whose indices
    come from a "uniform" or "gaussian" distribution.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).

    Raises:
        ValueError: if rand_data_dist is neither "uniform" nor "gaussian".
            (The original `raise (tuple, "msg")` was invalid Python 3 — a
            raised object must derive from BaseException.)
    """
    # dense feature
    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    lS_emb_offsets = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for size in ln_emb:
        lS_batch_offsets = []
        lS_batch_indices = []
        offset = 0
        for _ in range(n):
            # number of sparse indices for this lookup
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int64(
                    np.round(max([1.0], r * min(size, num_indices_per_lookup)))
                )
            # sparse indices to be used per embedding
            if rand_data_dist == "gaussian":
                if rand_data_mu == -1:
                    # default the mean to the middle of the allowed range
                    rand_data_mu = (rand_data_max + rand_data_min) / 2.0
                r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size)
                sparse_group = np.clip(r, rand_data_min, rand_data_max)
                # NOTE(review): unique runs on floats before the int cast, so
                # casting can reintroduce duplicate indices — confirm intent.
                sparse_group = np.unique(sparse_group).astype(np.int64)
            elif rand_data_dist == "uniform":
                r = ra.random(sparse_group_size)
                sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))
            else:
                raise ValueError(
                    rand_data_dist
                    + " distribution is not supported. "
                    + "please select uniform or gaussian"
                )
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int64(sparse_group.size)
            # store lengths and indices
            lS_batch_offsets += [offset]
            lS_batch_indices += sparse_group.tolist()
            # update offset for next iteration
            offset += sparse_group_size
        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
        lS_emb_indices.append(torch.tensor(lS_batch_indices))
    return (Xt, lS_emb_offsets, lS_emb_indices)
# synthetic distribution (input data)
def generate_synthetic_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    trace_file,
    enable_padding=False,
):
    """Generate a batch whose sparse lookups follow a synthetic trace model.

    For embedding table i, a stack-distance distribution is loaded from
    trace_file with "j" replaced by str(i), and lookup indices are produced
    via trace_generate_lru. Dense features remain uniform random.

    Returns (Xt, lS_emb_offsets, lS_emb_indices).
    """
    # dense feature
    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
    # sparse feature (sparse indices)
    lS_emb_offsets = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for i, size in enumerate(ln_emb):
        lS_batch_offsets = []
        lS_batch_indices = []
        offset = 0
        for _ in range(n):
            # num of sparse indices to be used per embedding (between
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int64(
                    max(1, np.round(r * min(size, num_indices_per_lookup))[0])
                )
            # sparse indices to be used per embedding
            # NOTE(review): the distribution file is re-read for every sample;
            # hoisting the read out of this loop would alter the generated
            # sequence because trace_generate_lru mutates line_accesses —
            # confirm before optimizing.
            file_path = trace_file
            line_accesses, list_sd, cumm_sd = read_dist_from_file(
                file_path.replace("j", str(i))
            )
            # debug prints
            # print("input")
            # print(line_accesses); print(list_sd); print(cumm_sd);
            # print(sparse_group_size)
            # approach 1: rand
            # r = trace_generate_rand(
            #     line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            # )
            # approach 2: lru
            r = trace_generate_lru(
                line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
            )
            # WARNING: if the distribution in the file is not consistent
            # with embedding table dimensions, below mod guards against out
            # of range access
            sparse_group = np.unique(r).astype(np.int64)
            minsg = np.min(sparse_group)
            maxsg = np.max(sparse_group)
            if (minsg < 0) or (size <= maxsg):
                print(
                    "WARNING: distribution is inconsistent with embedding "
                    + "table size (using mod to recover and continue)"
                )
                sparse_group = np.mod(sparse_group, size).astype(np.int64)
            # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64))
            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int64(sparse_group.size)
            # store lengths and indices
            lS_batch_offsets += [offset]
            lS_batch_indices += sparse_group.tolist()
            # update offset for next iteration
            offset += sparse_group_size
        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
        lS_emb_indices.append(torch.tensor(lS_batch_indices))
    return (Xt, lS_emb_offsets, lS_emb_indices)
def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
    """Sample a stack distance from the empirical CDF (cumm_val, cumm_dist).

    While fewer than max_i unique references have been seen (i < max_i), the
    distribution's support is shrunk so only distances reachable with i
    unique references can be drawn. Once everything has been seen and
    enable_padding is set, the first bucket (a "new reference") is excluded
    instead.
    """
    u = ra.rand(1)
    if i < max_i:
        # only generate stack distances up to the number of new references seen so far
        bucket = bisect.bisect(cumm_val, i) - 1
        u *= cumm_dist[bucket]  # shrink support to exclude the tail values
    elif enable_padding:
        # WARNING: disable generation of new references (once all have been seen)
        first_mass = cumm_dist[0]
        u = first_mass + (1.0 - first_mass) * u  # remap support past first value
    for value, mass in zip(cumm_val, cumm_dist):
        if u <= mass:
            return value
# WARNING: global define, must be consistent across all synthetic functions
# Scale factor between a cache-line id and a memory reference; 1 means each
# line maps to exactly one reference.
cache_line_size = 1
def trace_generate_lru(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Synthesize a memory-reference trace of length out_trace_len from the
    stack-distance distribution, using an LRU stack model.

    Mutates line_accesses in place: every touched line is moved to the end
    (most-recently-used position). Returns the trace as a deque of np.uint64.
    """
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)
    unique_seen = 0
    trace = deque()
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, unique_seen, enable_padding
        )
        line_offset = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:
            # new reference: consume the next never-seen line from the front
            line_ref = line_accesses[0]
            del line_accesses[0]
            line_accesses.append(line_ref)
            unique_seen += 1
        else:
            # existing reference: re-touch the line at stack depth sd and
            # promote it to most-recently-used
            line_ref = line_accesses[num_lines - sd]
            del line_accesses[num_lines - sd]
            line_accesses.append(line_ref)
        trace.append(np.uint64(line_ref * cache_line_size + line_offset))
    return trace
def trace_generate_rand(
    line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
):
    """Like trace_generate_lru, but existing references do not update recency:
    only brand-new references rotate line_accesses. Returns a list of
    np.uint64 references."""
    max_sd = list_sd[-1]
    num_lines = len(line_accesses)  # !!!Unique,
    unique_seen = 0
    trace = []
    for _ in range(out_trace_len):
        sd = generate_stack_distance(
            list_sd, cumm_sd, max_sd, unique_seen, enable_padding
        )
        line_offset = 0  # floor(ra.rand(1)*cache_line_size) #0
        if sd == 0:
            # new reference: rotate the front (never-seen) line to the back
            line_ref = line_accesses.pop(0)
            line_accesses.append(line_ref)
            unique_seen += 1
        else:
            # existing reference at stack depth sd (recency left unchanged)
            line_ref = line_accesses[num_lines - sd]
        trace.append(np.uint64(line_ref * cache_line_size + line_offset))
    return trace
def trace_profile(trace, enable_padding=False):
    """Profile a memory-reference trace into LRU stack distances.

    Returns (rstack, stack_distances, line_accesses): the final LRU stack,
    the stack distance of every access, and the first-touch order of unique
    lines. stack_distances and line_accesses are built with appendleft, so
    they come out in reverse chronological order (callers reverse() them).

    Bug fixes vs. the original:
      * `stack_distances + [0] * padding` raised TypeError (deque + list is
        not supported) whenever enable_padding was True — replaced with an
        in-place extend.
      * padding also divided by max(stack_distances), which is 0 when every
        access is unique — now guarded.
    """
    # number of elements in the array (assuming 1D)
    # n = trace.size
    rstack = deque()  # S: LRU stack of lines (right end = most recent)
    stack_distances = deque()  # SDS
    line_accesses = deque()  # L: unique lines in first-touch order
    for x in trace:
        r = np.uint64(x / cache_line_size)
        l = len(rstack)
        try:  # found #
            i = rstack.index(r)
            # WARNING: I believe below is the correct depth in terms of meaning of the
            # algorithm, but that is not what seems to be in the paper alg.
            # -1 can be subtracted if we defined the distance between
            # consecutive accesses (e.g. r, r) as 0 rather than 1.
            sd = l - i  # - 1
            # push r to the end of stack_distances
            stack_distances.appendleft(sd)
            # remove r from its position and insert to the top of stack
            del rstack[i]  # rstack.remove(r)
            rstack.append(r)
        except ValueError:  # not found #
            sd = 0  # -1
            # push r to the end of stack_distances/line_accesses
            stack_distances.appendleft(sd)
            line_accesses.appendleft(r)
            # push r to the top of stack
            rstack.append(r)
    if enable_padding:
        # WARNING: notice that as the ratio between the number of samples (l)
        # and cardinality [c] of a sample increases the probability of
        # generating a sample gets smaller and smaller because there are
        # few new samples compared to repeated samples. This means that for a
        # long trace with relatively small cardinality it will take longer to
        # generate all new samples and therefore obtain full distribution support
        # and hence it takes longer for distribution to resemble the original.
        # Therefore, we may pad the number of new samples to be on par with
        # average number of samples l/c artificially.
        l = len(stack_distances)
        c = max(stack_distances)
        if c > 0:  # all-unique traces have nothing to pad against
            padding = int(np.ceil(l / c))
            # extend in place: deque does not support `deque + list`
            stack_distances.extend([0] * padding)
    return (rstack, stack_distances, line_accesses)
# auxiliary read/write routines
def read_trace_from_file(file_path):
    """Read a memory-reference trace from file_path.

    Depends on the module-level ``args`` namespace (set under __main__):
    args.trace_file_binary_type selects raw uint64 (np.fromfile) vs text
    ("v1, v2, ..." on a single line) parsing.

    Returns a list of np.uint64, or None (implicit) if reading fails.
    """
    try:
        with open(file_path) as f:
            # NOTE(review): the binary branch reads through a text-mode
            # handle; np.fromfile works on POSIX but may garble data on
            # Windows — confirm whether "rb" was intended.
            if args.trace_file_binary_type:
                array = np.fromfile(f, dtype=np.uint64)
                trace = array.astype(np.uint64).tolist()
            else:
                line = f.readline()
                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
            return trace
    except Exception:
        print(f"ERROR: trace file '{file_path}' is not available.")
def write_trace_to_file(file_path, trace):
    """Persist a trace, either as raw uint64 bytes or as comma-separated
    text, depending on the module-level args.trace_file_binary_type flag."""
    try:
        if args.trace_file_binary_type:
            with open(file_path, "wb+") as f:
                np.array(trace).astype(np.uint64).tofile(f)
        else:
            with open(file_path, "w+") as f:
                rendered = str(list(trace))
                # drop the surrounding "[" and "]"
                f.write(rendered[1:-1])
    except Exception:
        print("ERROR: no output trace file has been provided")
def read_dist_from_file(file_path):
    """Load a stack-distance distribution written by write_dist_to_file.

    The file holds three comma-separated lines: unique line accesses,
    stack-distance values, and the matching cumulative probabilities.

    Returns (unique_accesses, list_sd, cumm_sd).

    Raises the original exception if the file cannot be read. (Previously
    the failure was only printed — with a message missing its f-prefix —
    and execution fell through to a confusing NameError on `lines`.)
    """
    try:
        with open(file_path, "r") as f:
            lines = f.read().splitlines()
    except Exception:
        print(f"{file_path} Wrong file or file path")
        raise
    # read unique accesses
    unique_accesses = [int(el) for el in lines[0].split(", ")]
    # read cumulative distribution (elements are passed as two separate lists)
    list_sd = [int(el) for el in lines[1].split(", ")]
    cumm_sd = [float(el) for el in lines[2].split(", ")]
    return unique_accesses, list_sd, cumm_sd
def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
    """Write the three distribution components, one comma-separated line
    each: unique accesses, stack-distance values, cumulative probabilities."""

    def _strip_brackets(rendered):
        # "[1, 2, 3]" -> "1, 2, 3\n"
        return rendered[1 : len(rendered) - 1] + "\n"

    try:
        with open(file_path, "w") as f:
            f.write(_strip_brackets(str(list(unique_accesses))))
            f.write(_strip_brackets(str(list_sd)))
            f.write(_strip_brackets(str(list(cumm_sd))))
    except Exception:
        print("Wrong file or file path")
if __name__ == "__main__":
    # Standalone mode: profile an input trace into a stack-distance
    # distribution, write the distribution, then regenerate a synthetic
    # trace from it via the LRU model.
    import argparse
    import operator
    ### parse arguments ###
    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
    # NOTE(review): argparse type=bool treats any non-empty string as True
    # (e.g. "--trace-file-binary-type False" still yields True) — confirm.
    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
    parser.add_argument("--trace-enable-padding", type=bool, default=False)
    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
    parser.add_argument(
        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
    )
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--print-precision", type=int, default=5)
    args = parser.parse_args()
    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    ### read trace ###
    trace = read_trace_from_file(args.trace_file)
    # print(trace)
    ### profile trace ###
    (_, stack_distances, line_accesses) = trace_profile(
        trace, args.trace_enable_padding
    )
    # trace_profile emits via appendleft, so reverse back to chronological order
    stack_distances.reverse()
    line_accesses.reverse()
    # print(line_accesses)
    # print(stack_distances)
    ### compute probability distribution ###
    # count items
    l = len(stack_distances)
    dc = sorted(
        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
    )
    # create a distribution
    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
    dist_sd = list(
        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
    )  # k = tuple_x_k[1]
    cumm_sd = deque()  # np.cumsum(dc).tolist() #prefixsum
    for i, (_, k) in enumerate(dc):
        if i == 0:
            cumm_sd.append(k / float(l))
        else:
            # add the 2nd element of the i-th tuple in the dist_sd list
            cumm_sd.append(cumm_sd[i - 1] + (k / float(l)))
    ### write stack_distance and line_accesses to a file ###
    write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd)
    ### generate corresponding synthetic ###
    # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file)
    synthetic_trace = trace_generate_lru(
        line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    )
    # synthetic_trace = trace_generate_rand(
    #     line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
    # )
    write_trace_to_file(args.synthetic_file, synthetic_trace)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import functools
# others
import operator
import time
# onnx
# The onnx import causes deprecation warnings every time workers
# are spawned during testing. So, we filter out those warnings.
import warnings
# data generation
import dlrm_data_pytorch as dp
# numpy
import numpy as np
import sklearn.metrics
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
try:
import caffe2.python.onnx.frontend
import onnx
except ImportError as error:
print("Unable to import onnx or caffe2.python.onnx.frontend ", error)
# from caffe2.python import data_parallel_model
# caffe2
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace
"""
# auxiliary routine used to split input on the mini-batch dimension
def where_to_split(mini_batch_size, ndevices, _add_leftover=False):
    n = (mini_batch_size + ndevices - 1) // ndevices  # ceiling
    l = mini_batch_size - n * (ndevices - 1)  # leftover
    ls = [n] * (ndevices - 1)
    if _add_leftover:
        ls += [l if l > 0 else n]
    return ls
"""
### define dlrm in Caffe2 ###
class DLRM_Net(object):
    def FeedBlobWrapper(self, tag, val, add_prefix=True, split=False, device_id=-1):
        """Feed numpy array `val` into the caffe2 workspace as blob `tag`.

        With ndevices > 1 and add_prefix, the blob is fed under a
        "gpu_<d>/" prefix on every device: split=True shards the
        mini-batch evenly across devices, otherwise the full tensor is
        replicated. With device_id >= 0 a single-device feed is pinned to
        that GPU; otherwise it goes to the default device.
        """
        if self.ndevices > 1 and add_prefix:
            if split:
                # split across devices
                mini_batch_size = val.shape[0]
                # approach 1: np and caffe2 operators assume the mini-batch size is
                # divisible exactly by the number of available devices
                if mini_batch_size % self.ndevices != 0:
                    sys.exit(
                        "ERROR: caffe2 net assumes that the mini_batch_size "
                        + str(mini_batch_size)
                        + " is evenly divisible by the number of available devices"
                        + str(self.ndevices)
                    )
                vals = np.split(val, self.ndevices, axis=0)
                """
                # approach 2: np and caffe2 operators do not assume exact divisibility
                if args.mini_batch_size != mini_batch_size:
                    sys.exit("ERROR: caffe2 net was prepared for mini-batch size "
                        + str(args.mini_batch_size)
                        + " which is different from current mini-batch size "
                        + str(mini_batch_size) + " being passed to it. "
                        + "This is common for the last mini-batch, when "
                        + "mini-batch size does not evenly divided the number of "
                        + "elements in the data set.")
                ls = where_to_split(mini_batch_size, self.ndevices)
                vals = np.split(val, ls, axis=0)
                """
                # feed one shard to each device
                for d in range(self.ndevices):
                    tag_on_device = "gpu_" + str(d) + "/" + tag
                    _d = core.DeviceOption(workspace.GpuDeviceType, d)
                    workspace.FeedBlob(tag_on_device, vals[d], device_option=_d)
            else:
                # feed the full tensor to every device (replication)
                for d in range(self.ndevices):
                    tag_on_device = "gpu_" + str(d) + "/" + tag
                    _d = core.DeviceOption(workspace.GpuDeviceType, d)
                    workspace.FeedBlob(tag_on_device, val, device_option=_d)
        else:
            # feed to a single device (named or not)
            if device_id >= 0:
                _d = core.DeviceOption(workspace.GpuDeviceType, device_id)
                workspace.FeedBlob(tag, val, device_option=_d)
            else:
                workspace.FeedBlob(tag, val)
def FetchBlobWrapper(self, tag, add_prefix=True, reduce_across=None, device_id=-1):
if self.ndevices > 1 and add_prefix:
# fetch from multiple devices
vals = []
for d in range(self.ndevices):
if tag.__class__ == list:
tag_on_device = tag[d]
else:
tag_on_device = "gpu_" + str(0) + "/" + tag
val = workspace.FetchBlob(tag_on_device)
vals.append(val)
# reduce across devices
if reduce_across == "add":
return functools.reduce(operator.add, vals)
elif reduce_across == "concat":
return np.concatenate(vals)
else:
return vals
else:
# fetch from a single device (named or not)
if device_id >= 0:
tag_on_device = "gpu_" + str(device_id) + "/" + tag
return workspace.FetchBlob(tag_on_device)
else:
return workspace.FetchBlob(tag)
    def AddLayerWrapper(
        self, layer, inp_blobs, out_blobs, add_prefix=True, reset_grad=False, **kwargs
    ):
        """Add operator `layer` to the model, replicated per device if needed.

        In multi-device mode the layer is emitted once per device with
        "gpu_<d>/"-prefixed blob tags and a list of per-device layers is
        returned; otherwise the single new layer is returned. `reset_grad`
        substitutes the gradient blob of inp_blobs[0] into the "" placeholder
        at inp_blobs[2] (used only for the WeightedSum update op).
        """
        # auxiliary routine to adjust tags: prefix string or list-of-string
        # blob names with the device scope, pass anything else through as-is
        def adjust_tag(blobs, on_device):
            if blobs.__class__ == str:
                _blobs = on_device + blobs
            elif blobs.__class__ == list:
                _blobs = list(map(lambda tag: on_device + tag, blobs))
            else:  # blobs.__class__ == model_helper.ModelHelper or something else
                _blobs = blobs
            return _blobs

        if self.ndevices > 1 and add_prefix:
            # add layer on multiple devices
            ll = []
            for d in range(self.ndevices):
                # add prefix on_device
                on_device = "gpu_" + str(d) + "/"
                _inp_blobs = adjust_tag(inp_blobs, on_device)
                _out_blobs = adjust_tag(out_blobs, on_device)
                # WARNING: reset_grad option was exlusively designed for WeightedSum
                # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced
                if reset_grad:
                    w_grad = self.gradientMap[_inp_blobs[0]]
                    _inp_blobs[2] = w_grad
                # add layer to the model
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    if kwargs:
                        new_layer = layer(_inp_blobs, _out_blobs, **kwargs)
                    else:
                        new_layer = layer(_inp_blobs, _out_blobs)
                ll.append(new_layer)
            return ll
        else:
            # add layer on a single device
            # WARNING: reset_grad option was exlusively designed for WeightedSum
            # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced
            if reset_grad:
                w_grad = self.gradientMap[inp_blobs[0]]
                inp_blobs[2] = w_grad
            # add layer to the model
            if kwargs:
                new_layer = layer(inp_blobs, out_blobs, **kwargs)
            else:
                new_layer = layer(inp_blobs, out_blobs)
            return new_layer
    def create_mlp(self, ln, sigmoid_layer, model, tag):
        """Build an MLP whose layer sizes are given by the array `ln`.

        Layer `sigmoid_layer` (1-based index into ln) gets a Sigmoid
        activation; all others get ReLU. Weights are initialized host-side
        with a Xavier-like normal fill and fed into the workspace. Returns
        (layers, weights): the per-layer activation ops and the flat list of
        weight/bias blob tags ([fc1_w, fc1_b, fc2_w, ...]).
        """
        (tag_layer, tag_in, tag_out) = tag
        # build MLP layer by layer
        layers = []
        weights = []
        for i in range(1, ln.size):
            n = ln[i - 1]
            m = ln[i]
            # create tags
            tag_fc_w = tag_layer + ":::" + "fc" + str(i) + "_w"
            tag_fc_b = tag_layer + ":::" + "fc" + str(i) + "_b"
            tag_fc_y = tag_layer + ":::" + "fc" + str(i) + "_y"
            tag_fc_z = tag_layer + ":::" + "fc" + str(i) + "_z"
            if i == ln.size - 1:
                tag_fc_z = tag_out
            weights.append(tag_fc_w)
            weights.append(tag_fc_b)
            # initialize the weights
            # approach 1: custom Xavier input, output or two-sided fill
            mean = 0.0  # std_dev = np.sqrt(variance)
            std_dev = np.sqrt(2 / (m + n))  # np.sqrt(1 / m) # np.sqrt(1 / n)
            W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
            std_dev = np.sqrt(1 / m)  # np.sqrt(2 / (m + 1))
            b = np.random.normal(mean, std_dev, size=m).astype(np.float32)
            self.FeedBlobWrapper(tag_fc_w, W)
            self.FeedBlobWrapper(tag_fc_b, b)
            # approach 2: caffe2 xavier
            # W = self.AddLayerWrapper(
            #     model.param_init_net.XavierFill,
            #     [],
            #     tag_fc_w,
            #     shape=[m, n]
            # )
            # b = self.AddLayerWrapper(
            #     model.param_init_net.ConstantFill,
            #     [],
            #     tag_fc_b,
            #     shape=[m]
            # )
            # initialize the MLP's momentum for the Adagrad optimizer
            # (odd index = weight momentum, even index = bias momentum)
            if self.emb_optimizer in ["adagrad", "rwsadagrad"]:
                # momentum of the weights
                self.FeedBlobWrapper(
                    "momentum_mlp_{}_{}".format(tag_layer, 2 * i - 1),
                    np.full((m, n), 0, dtype=np.float32),
                )
                # momentum of the biases
                self.FeedBlobWrapper(
                    "momentum_mlp_{}_{}".format(tag_layer, 2 * i),
                    np.full((m), 0, dtype=np.float32),
                )
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                self.onnx_tsd[tag_fc_w] = (onnx.TensorProto.FLOAT, W.shape)
                self.onnx_tsd[tag_fc_b] = (onnx.TensorProto.FLOAT, b.shape)
            # approach 1: construct fully connected operator using model.net
            fc = self.AddLayerWrapper(
                model.net.FC, [tag_in, tag_fc_w, tag_fc_b], tag_fc_y
            )
            # approach 2: construct fully connected operator using brew
            # https://github.com/caffe2/tutorials/blob/master/MNIST.ipynb
            # fc = brew.fc(model, layer, tag_fc_w, dim_in=m, dim_out=n)
            layers.append(fc)
            if i == sigmoid_layer:
                # approach 1: construct sigmoid operator using model.net
                layer = self.AddLayerWrapper(model.net.Sigmoid, tag_fc_y, tag_fc_z)
                # approach 2: using brew (which currently does not support sigmoid)
                # tag_sigm = tag_layer + ":::" + "sigmoid" + str(i)
                # layer = brew.sigmoid(model,fc,tag_sigmoid)
            else:
                # approach 1: construct relu operator using model.net
                layer = self.AddLayerWrapper(model.net.Relu, tag_fc_y, tag_fc_z)
                # approach 2: using brew
                # tag_relu = tag_layer + ":::" + "relu" + str(i)
                # layer = brew.relu(model,fc,tag_relu)
            tag_in = tag_fc_z
            layers.append(layer)
        # WARNING: the dependency between layers is implicit in the tags,
        # so only the last layer is added to the layers list. It will
        # later be used for interactions.
        return layers, weights
    def create_emb(self, m, ln, model, tag):
        """Build one embedding lookup per table in `ln` (table sizes), each of
        width `m`, pooled with SparseLengthsSum or — when weighted_pooling is
        set — SparseLengthsWeightedSum over per-sample weights.

        Tables are round-robined over devices in multi-device mode. Returns
        (emb_l, weights_l, vw_l): lookup ops, table-weight blob tags, and the
        per-sample-weight blob tags (only those that are learned).
        """
        (tag_layer, tag_in, tag_out) = tag
        emb_l = []
        weights_l = []
        vw_l = []
        for i in range(0, ln.size):
            n = ln[i]
            # select device (round-robin across tables)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_i"
            tbl_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_w"
            sum_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_z"
            weights_l.append(tbl_s)
            # initialize the weights
            # approach 1a: custom uniform fill in [-1/sqrt(n), 1/sqrt(n)]
            W = np.random.uniform(
                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
            ).astype(np.float32)
            # approach 1b: numpy rand
            # W = ra.rand(n, m).astype(np.float32)
            self.FeedBlobWrapper(tbl_s, W, False, device_id=d)
            # approach 2: caffe2 xavier
            # with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
            #     W = model.param_init_net.XavierFill([], tbl_s, shape=[n, m])
            # save the blob shapes for latter (only needed if onnx is requested)
            # initialize the embedding's momentum for the Adagrad optimizer
            # (rwsadagrad keeps one momentum value per row, hence shape (n,))
            if self.emb_optimizer == "adagrad":
                self.FeedBlobWrapper(
                    "momentum_emb_{}".format(i),
                    np.full((n, m), 0),
                    add_prefix=False,
                    device_id=d,
                )
            elif self.emb_optimizer == "rwsadagrad":
                self.FeedBlobWrapper(
                    "momentum_emb_{}".format(i),
                    np.full((n), 0),
                    add_prefix=False,
                    device_id=d,
                )
            if self.save_onnx:
                self.onnx_tsd[tbl_s] = (onnx.TensorProto.FLOAT, W.shape)
            # create operator
            if self.weighted_pooling is not None:
                vw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_v"
                psw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_s"
                VW = np.ones(n).astype(np.float32)
                self.FeedBlobWrapper(vw_s, VW, False, device_id=d)
                if self.weighted_pooling == "learned":
                    vw_l.append(vw_s)
                    grad_on_weights = True
                else:
                    grad_on_weights = False
                if self.save_onnx:
                    self.onnx_tsd[vw_s] = (onnx.TensorProto.FLOAT, VW.shape)
                if self.ndevices <= 1:
                    PSW = model.net.Gather([vw_s, ind_s], [psw_s])
                    EE = model.net.SparseLengthsWeightedSum(
                        [tbl_s, PSW, ind_s, len_s],
                        [sum_s],
                        grad_on_weights=grad_on_weights,
                    )
                else:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        PSW = model.net.Gather([vw_s, ind_s], [psw_s])
                        EE = model.net.SparseLengthsWeightedSum(
                            [tbl_s, PSW, ind_s, len_s],
                            [sum_s],
                            grad_on_weights=grad_on_weights,
                        )
            else:
                if self.ndevices <= 1:
                    EE = model.net.SparseLengthsSum([tbl_s, ind_s, len_s], [sum_s])
                else:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        EE = model.net.SparseLengthsSum([tbl_s, ind_s, len_s], [sum_s])
            emb_l.append(EE)
        return emb_l, weights_l, vw_l
    def create_interactions(self, x, ly, model, tag):
        """Combine dense features `x` with embedding outputs `ly`.

        "dot": stack all features, form pairwise dot products via BatchMatMul,
        keep the lower-triangular entries (via the precomputed
        "<tag>_tril_indices" blob), and concatenate them back with the dense
        features. "cat": simple row-wise concatenation. Returns the output op.
        """
        (tag_dense_in, tag_sparse_in, tag_int_out) = tag

        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            tag_int_out_info = tag_int_out + "_info"
            T, T_info = model.net.Concat(
                x + ly,
                [tag_int_out + "_cat_axis0", tag_int_out_info + "_cat_axis0"],
                axis=1,
                add_axis=1,
            )
            # perform a dot product
            Z = model.net.BatchMatMul([T, T], tag_int_out + "_matmul", trans_b=1)
            # append dense feature with the interactions (into a row vector)
            # approach 1: all
            # Zflat = model.net.Flatten(Z, tag_int_out + "_flatten", axis=1)
            # approach 2: unique (drop the symmetric upper triangle)
            Zflat_all = model.net.Flatten(Z, tag_int_out + "_flatten_all", axis=1)
            Zflat = model.net.BatchGather(
                [Zflat_all, tag_int_out + "_tril_indices"], tag_int_out + "_flatten"
            )
            R, R_info = model.net.Concat(
                x + [Zflat], [tag_int_out, tag_int_out_info], axis=1
            )
        elif self.arch_interaction_op == "cat":
            # concatenation features (into a row vector)
            tag_int_out_info = tag_int_out + "_info"
            R, R_info = model.net.Concat(
                x + ly, [tag_int_out, tag_int_out_info], axis=1
            )
        else:
            sys.exit(
                "ERROR: --arch-interaction-op="
                + self.arch_interaction_op
                + " is not supported"
            )
        return R
    def create_sequential_forward_ops(self):
        """Build the single-device forward graph:
        embeddings + bottom MLP -> feature interactions -> top MLP.
        Stores the per-stage ops/weights on self and sets self.last_output.
        """
        # embeddings
        tag = (self.temb, self.tsin, self.tsout)
        self.emb_l, self.emb_w, self.emb_vw = self.create_emb(
            self.m_spa, self.ln_emb, self.model, tag
        )
        # bottom mlp
        tag = (self.tbot, self.tdin, self.tdout)
        self.bot_l, self.bot_w = self.create_mlp(
            self.ln_bot, self.sigmoid_bot, self.model, tag
        )
        # interactions (between last bottom-MLP output and all embeddings)
        tag = (self.tdout, self.tsout, self.tint)
        Z = self.create_interactions([self.bot_l[-1]], self.emb_l, self.model, tag)
        # top mlp
        tag = (self.ttop, Z, self.tout)
        self.top_l, self.top_w = self.create_mlp(
            self.ln_top, self.sigmoid_top, self.model, tag
        )
        # debug prints
        # print(self.emb_l)
        # print(self.bot_l)
        # print(self.top_l)
        # setup the last output variable
        self.last_output = self.top_l[-1]
    def create_parallel_forward_ops(self):
        """Build the multi-device forward graph: embeddings are model-parallel
        (one table per device, round-robin), MLPs are data-parallel, and a
        butterfly shuffle (split + scatter copies) redistributes each table's
        output so every device holds its mini-batch shard of all tables.
        """
        # distribute embeddings (model parallelism)
        tag = (self.temb, self.tsin, self.tsout)
        self.emb_l, self.emb_w, self.emb_vw = self.create_emb(
            self.m_spa, self.ln_emb, self.model, tag
        )
        # replicate mlp (data parallelism)
        tag = (self.tbot, self.tdin, self.tdout)
        self.bot_l, self.bot_w = self.create_mlp(
            self.ln_bot, self.sigmoid_bot, self.model, tag
        )

        # add communication (butterfly shuffle)
        t_list = []
        for i, emb_output in enumerate(self.emb_l):
            # split input on the device that owns table i
            src_d = i % self.ndevices
            lo = [emb_output + "_split_" + str(d) for d in range(self.ndevices)]
            # approach 1: np and caffe2 operators assume the mini-batch size is
            # divisible exactly by the number of available devices
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)):
                self.model.net.Split(emb_output, lo, axis=0)
            """
            # approach 2: np and caffe2 operators do not assume exact divisibility
            ls = where_to_split(args.mini_batch_size, self.ndevices, _add_leftover=True)
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)):
                emb_output_split = self.model.net.Split(
                    emb_output, lo, split=lp, axis=0
                )
            """
            # scatter: copy each shard to its destination device (no-op when
            # source and destination coincide)
            y = []
            for dst_d in range(len(lo)):
                src_blob = lo[dst_d]
                dst_blob = str(src_blob).replace(
                    "gpu_" + str(src_d), "gpu_" + str(dst_d), 1
                )
                if src_blob != dst_blob:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, dst_d)
                    ):
                        blob = self.model.Copy(src_blob, dst_blob)
                else:
                    blob = dst_blob
                y.append(blob)
            t_list.append(y)
        # adjust lists to be ordered per device
        x = list(map(lambda x: list(x), zip(*self.bot_l)))
        ly = list(map(lambda y: list(y), zip(*t_list)))

        # interactions (one per device, on that device's shard)
        for d in range(self.ndevices):
            on_device = "gpu_" + str(d) + "/"
            tag = (
                on_device + self.tdout,
                on_device + self.tsout,
                on_device + self.tint,
            )
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                self.create_interactions([x[d][-1]], ly[d], self.model, tag)

        # replicate mlp (data parallelism)
        tag = (self.ttop, self.tint, self.tout)
        self.top_l, self.top_w = self.create_mlp(
            self.ln_top, self.sigmoid_top, self.model, tag
        )

        # debug prints
        # print(self.model.net.Proto(),end='\n')
        # sys.exit("ERROR: debugging")

        # setup the last output variable
        self.last_output = self.top_l[-1]
    def __init__(
        self,
        m_spa,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op,
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=-1,
        save_onnx=False,
        model=None,
        test_net=None,
        tag=None,
        ndevices=-1,
        forward_ops=True,
        enable_prof=False,
        weighted_pooling=None,
        emb_optimizer="sgd",
    ):
        """Configure the DLRM and (optionally) build its forward graph.

        m_spa: embedding width; ln_emb/ln_bot/ln_top: table sizes and MLP
        layer sizes; arch_interaction_op: "dot" or "cat". When `model` is
        None a fresh caffe2 workspace/ModelHelper is created; otherwise the
        caller supplies the model, test_net, and a 10-element `tag` tuple
        (workspace/tags are assumed to be initialized elsewhere).
        """
        super(DLRM_Net, self).__init__()

        # init model
        if model is None:
            global_init_opt = ["caffe2", "--caffe2_log_level=0"]
            if enable_prof:
                global_init_opt += [
                    "--logtostderr=0",
                    "--log_dir=$HOME",
                    "--caffe2_logging_print_net_summary=1",
                ]
            workspace.GlobalInit(global_init_opt)
            self.set_tags()
            self.model = model_helper.ModelHelper(name="DLRM", init_params=True)
            self.test_net = None
        else:
            # WARNING: assume that workspace and tags have been initialized elsewhere
            self.set_tags(
                tag[0],
                tag[1],
                tag[2],
                tag[3],
                tag[4],
                tag[5],
                tag[6],
                tag[7],
                tag[8],
                tag[9],
            )
            self.model = model
            self.test_net = test_net

        # save arguments
        self.m_spa = m_spa
        self.ln_emb = ln_emb
        self.ln_bot = ln_bot
        self.ln_top = ln_top
        self.arch_interaction_op = arch_interaction_op
        self.arch_interaction_itself = arch_interaction_itself
        self.sigmoid_bot = sigmoid_bot
        self.sigmoid_top = sigmoid_top
        self.save_onnx = save_onnx
        self.ndevices = ndevices
        self.emb_optimizer = emb_optimizer
        # any non-"fixed" weighted pooling request is treated as "learned"
        if weighted_pooling is not None and weighted_pooling != "fixed":
            self.weighted_pooling = "learned"
        else:
            self.weighted_pooling = weighted_pooling
        # onnx types and shapes dictionary
        if self.save_onnx:
            self.onnx_tsd = {}
        # create forward operators
        # NOTE: the create_*_forward_ops methods return None (they build the
        # graph in place), so returning their result from __init__ is legal.
        if forward_ops:
            if self.ndevices <= 1:
                return self.create_sequential_forward_ops()
            else:
                return self.create_parallel_forward_ops()
def set_tags(
self,
_tag_layer_top_mlp="top",
_tag_layer_bot_mlp="bot",
_tag_layer_embedding="emb",
_tag_feature_dense_in="dense_in",
_tag_feature_dense_out="dense_out",
_tag_feature_sparse_in="sparse_in",
_tag_feature_sparse_out="sparse_out",
_tag_interaction="interaction",
_tag_dense_output="prob_click",
_tag_dense_target="target",
):
# layer tags
self.ttop = _tag_layer_top_mlp
self.tbot = _tag_layer_bot_mlp
self.temb = _tag_layer_embedding
# dense feature tags
self.tdin = _tag_feature_dense_in
self.tdout = _tag_feature_dense_out
# sparse feature tags
self.tsin = _tag_feature_sparse_in
self.tsout = _tag_feature_sparse_out
# output and target tags
self.tint = _tag_interaction
self.ttar = _tag_dense_target
self.tout = _tag_dense_output
    def parameters(self):
        """Return the underlying ModelHelper (stand-in for parameters())."""
        return self.model
    def get_loss(self):
        """Fetch the scalar loss, summed across devices in multi-device mode."""
        return self.FetchBlobWrapper(self.loss, reduce_across="add")
    def get_output(self):
        """Fetch the model output, concatenating per-device shards row-wise."""
        return self.FetchBlobWrapper(self.last_output, reduce_across="concat")
    def create(self, X, S_lengths, S_indices, T):
        """Feed initial inputs and instantiate the nets in the workspace."""
        self.create_input(X, S_lengths, S_indices, T)
        self.create_model(X, S_lengths, S_indices, T)
    def create_input(self, X, S_lengths, S_indices, T):
        """Feed the initial input blobs: dense X, per-table sparse
        lengths/indices, and a zero placeholder for the targets. Also records
        blob shapes for onnx export when requested.
        """
        # feed input data to blobs
        self.FeedBlobWrapper(self.tdin, X, split=True)
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[self.tdin] = (onnx.TensorProto.FLOAT, X.shape)

        for i in range(len(self.emb_l)):
            # select device (same round-robin as create_emb)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i"
            self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d)
            self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d)
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                lshape = (len(S_lengths[i]),)  # =args.mini_batch_size
                ishape = (len(S_indices[i]),)
                self.onnx_tsd[len_s] = (onnx.TensorProto.INT32, lshape)
                self.onnx_tsd[ind_s] = (onnx.TensorProto.INT32, ishape)

        # feed target data to blobs
        if T is not None:
            # targets are fed as zeros here; the real labels are fed in run()
            zeros_fp32 = np.zeros(T.shape).astype(np.float32)
            self.FeedBlobWrapper(self.ttar, zeros_fp32, split=True)
            # save the blob shapes for latter (only needed if onnx is requested)
            if self.save_onnx:
                self.onnx_tsd[self.ttar] = (onnx.TensorProto.FLOAT, T.shape)
    def create_model(self, X, S_lengths, S_indices, T):
        """Feed the lower-triangle index blob used by the "dot" interaction
        and instantiate the param-init, main, and (optional) test nets.
        """
        # setup tril indices for the interactions
        offset = 1 if self.arch_interaction_itself else 0
        num_fea = len(self.emb_l) + 1
        tril_indices = np.array(
            [j + i * num_fea for i in range(num_fea) for j in range(i + offset)]
        )
        self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices)

        # create compute graph
        if T is not None:
            # WARNING: RunNetOnce call is needed only if we use brew and ConstantFill.
            # We could use direct calls to self.model functions above to avoid it
            workspace.RunNetOnce(self.model.param_init_net)
            workspace.CreateNet(self.model.net)
            if self.test_net is not None:
                workspace.CreateNet(self.test_net)
    def run(self, X, S_lengths, S_indices, T, test_net=False, enable_prof=False):
        """Feed one mini-batch (dense X, sparse lengths/indices, targets T)
        and execute the test or training net; enable_prof runs the net under
        caffe2's benchmark instead.
        """
        # feed input data to blobs
        # dense features
        self.FeedBlobWrapper(self.tdin, X, split=True)
        # sparse features
        for i in range(len(self.emb_l)):
            # select device (same round-robin as create_emb)
            if self.ndevices > 1:
                d = i % self.ndevices
            else:
                d = -1
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l"
            ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i"
            self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d)
            self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d)

        # feed target data to blobs if needed
        if T is not None:
            self.FeedBlobWrapper(self.ttar, T, split=True)

        # execute compute graph
        if test_net:
            workspace.RunNet(self.test_net)
        else:
            if enable_prof:
                workspace.C.benchmark_net(self.model.net.Name(), 0, 1, True)
            else:
                workspace.RunNet(self.model.net)
        # debug prints
        # print("intermediate")
        # print(self.FetchBlobWrapper(self.bot_l[-1]))
        # for tag_emb in self.emb_l:
        #     print(self.FetchBlobWrapper(tag_emb))
        # print(self.FetchBlobWrapper(self.tint))
    def MSEloss(self, scale=1.0):
        """Add a mean-squared-error loss between output and target.

        NOTE(review): the 2.0 factor presumably compensates for the 1/2 in
        caffe2's SquaredL2Distance — confirm against the op's definition.
        """
        # add MSEloss to the model
        self.AddLayerWrapper(self.model.SquaredL2Distance, [self.tout, self.ttar], "sd")
        self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=2.0 * scale)
        # WARNING: "loss" is a special tag and should not be changed
        self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss")
    def BCEloss(self, scale=1.0, threshold=0.0):
        """Add a binary cross-entropy loss between output and target.

        A threshold in (0, 1) clips the predicted probability to
        [threshold, 1 - threshold] first, guarding the log against 0/1 inputs.
        """
        # add BCEloss to the mode
        if 0.0 < threshold and threshold < 1.0:
            self.AddLayerWrapper(
                self.model.Clip,
                self.tout,
                "tout_c",
                min=threshold,
                max=(1.0 - threshold),
            )
            self.AddLayerWrapper(self.model.MakeTwoClass, "tout_c", "tout_2c")
        else:
            self.AddLayerWrapper(self.model.MakeTwoClass, self.tout, "tout_2c")
        self.AddLayerWrapper(self.model.LabelCrossEntropy, ["tout_2c", self.ttar], "sd")
        # WARNING: "loss" is a special tag and should not be changed
        if scale == 1.0:
            self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd", "loss")
        else:
            self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=scale)
            self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss")
    def sgd_optimizer(
        self, learning_rate, T=None, _gradientMap=None, sync_dense_params=True
    ):
        """Add plain-SGD update operators for all weights.

        Dense MLP weights use WeightedSum (optionally NCCL-allreduced across
        devices first); embedding tables and learned per-sample weights use
        sparse ScatterWeightedSum updates on their owning device. `T` may
        supply existing (one, iter, lr) tags; `_gradientMap` may supply a
        precomputed gradient map.
        """
        # create one, it and lr tags (or use them if already present)
        if T is not None:
            (tag_one, tag_it, tag_lr) = T
        else:
            (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")
        # approach 1: feed values directly
        # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
        # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
        # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
        # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
        #     base_lr=-1 * learning_rate, policy="fixed")
        # approach 2: use brew
        self.AddLayerWrapper(
            self.model.param_init_net.ConstantFill,
            [],
            tag_one,
            shape=[1],
            value=1.0,
        )
        self.AddLayerWrapper(brew.iter, self.model, tag_it)
        # negative base_lr because WeightedSum adds lr * grad to the weights
        self.AddLayerWrapper(
            self.model.LearningRate,
            tag_it,
            tag_lr,
            base_lr=-1 * learning_rate,
            policy="fixed",
        )
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
            self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))
        # create gradient maps (or use them if already present)
        if _gradientMap is not None:
            self.gradientMap = _gradientMap
        else:
            if self.loss.__class__ == list:
                self.gradientMap = self.model.AddGradientOperators(self.loss)
            else:
                self.gradientMap = self.model.AddGradientOperators([self.loss])
        # update weights
        # approach 1: builtin function
        # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
        # approach 2: custom code
        # top MLP weight and bias
        for w in self.top_w:
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.AddLayerWrapper(
                self.model.WeightedSum, [w, tag_one, "", tag_lr], w, reset_grad=True
            )
        # bottom MLP weight and bias
        for w in self.bot_w:
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.AddLayerWrapper(
                self.model.WeightedSum, [w, tag_one, "", tag_lr], w, reset_grad=True
            )
        # update embeddings
        for i, w in enumerate(self.emb_w):
            # select device (d is only read below when ndevices > 1)
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pickup gradient
            w_grad = self.gradientMap[w]
            # update weights (sparse: only the gathered rows are touched)
            if self.ndevices > 1:
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
            else:
                self.model.ScatterWeightedSum(
                    [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                )
        # update per sample weights
        if self.weighted_pooling == "learned":
            for i, w in enumerate(self.emb_vw):
                # select device
                if self.ndevices > 1:
                    d = i % self.ndevices
                # create tags
                on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
                _tag_one = on_device + tag_one
                _tag_lr = on_device + tag_lr
                # pickup gradient
                w_grad = self.gradientMap[w]
                # update weights
                if self.ndevices > 1:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        self.model.ScatterWeightedSum(
                            [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                        )
                else:
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
    def adagrad_optimizer(
        self,
        learning_rate,
        T=None,
        _gradientMap=None,
        sync_dense_params=True,
        epsilon=1e-10,
        decay_=0.0,
        weight_decay_=0.0,
    ):
        """Add Adagrad update operators for all weights.

        Dense MLP weights use the Adagrad op with the momenta created in
        create_mlp; embedding tables deduplicate their sparse gradients
        (Unique + UnsortedSegmentSum) and then apply SparseAdagrad or
        RowWiseSparseAdagrad per self.emb_optimizer. Learned per-sample
        weights still use plain sparse SGD (ScatterWeightedSum).
        """
        # create one, it and lr tags (or use them if already present)
        if T is not None:
            (tag_one, tag_it, tag_lr) = T
        else:
            (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")
        # approach 1: feed values directly
        # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
        # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
        # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
        # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
        #     base_lr=-1 * learning_rate, policy="fixed")
        # approach 2: use brew
        self.AddLayerWrapper(
            self.model.param_init_net.ConstantFill,
            [],
            tag_one,
            shape=[1],
            value=1.0,
        )
        self.AddLayerWrapper(brew.iter, self.model, tag_it)
        self.AddLayerWrapper(
            self.model.LearningRate,
            tag_it,
            tag_lr,
            base_lr=-1 * learning_rate,
            policy="fixed",
        )
        # save the blob shapes for latter (only needed if onnx is requested)
        if self.save_onnx:
            self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
            self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))
        # create gradient maps (or use them if already present)
        if _gradientMap is not None:
            self.gradientMap = _gradientMap
        else:
            if self.loss.__class__ == list:
                self.gradientMap = self.model.AddGradientOperators(self.loss)
            else:
                self.gradientMap = self.model.AddGradientOperators([self.loss])
        # update weights
        # approach 1: builtin function
        # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
        # approach 2: custom code
        # top MLP weight and bias (momentum blobs are 1-indexed, see create_mlp)
        for i, w in enumerate(self.top_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.model.Adagrad(
                [w, "momentum_mlp_top_{}".format(i + 1), self.gradientMap[w], tag_lr],
                [w, "momentum_mlp_top_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_,
            )
        # bottom MLP weight and bias
        for i, w in enumerate(self.bot_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.model.Adagrad(
                [w, "momentum_mlp_bot_{}".format(i + 1), self.gradientMap[w], tag_lr],
                [w, "momentum_mlp_bot_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_,
            )
        # update embeddings
        for i, w in enumerate(self.emb_w):
            # select device (d is only read below when ndevices > 1)
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pickup gradient
            w_grad = self.gradientMap[w]

            # update weights; the closure captures the current i/_tag_lr/w_grad
            # and is invoked immediately below (inside or outside a DeviceScope),
            # so the usual late-binding pitfall does not apply here
            def add_optimizer():
                # deduplicate indices and sum the gradient rows per unique index
                self.model.Unique(
                    w_grad.indices,
                    ["unique_w_grad_indices", "remapping_w_grad_indices"],
                )
                self.model.UnsortedSegmentSum(
                    [w_grad.values, "remapping_w_grad_indices"], "unique_w_grad_values"
                )
                if self.emb_optimizer == "adagrad":
                    self.model.SparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr,
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_,
                    )
                elif self.emb_optimizer == "rwsadagrad":
                    self.model.RowWiseSparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr,
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_,
                    )

            if self.ndevices > 1:
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    add_optimizer()
            else:
                add_optimizer()
        # update per sample weights
        if self.weighted_pooling == "learned":
            for i, w in enumerate(self.emb_vw):
                # select device
                if self.ndevices > 1:
                    d = i % self.ndevices
                # create tags
                on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
                _tag_one = on_device + tag_one
                _tag_lr = on_device + tag_lr
                # pickup gradient
                w_grad = self.gradientMap[w]
                # update weights
                if self.ndevices > 1:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        self.model.ScatterWeightedSum(
                            [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                        )
                else:
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
    def print_all(self):
        """Debug helper: print every blob name and value in the workspace."""
        # approach 1: all
        print(workspace.Blobs(), end="\n")
        for _, l in enumerate(workspace.Blobs()):
            print(l)
            print(self.FetchBlobWrapper(l))
        # approach 2: only summary
        # for param in self.model.params:
        #     self.model.Summarize(param, [], to_file=1)
        #     self.model.Summarize(self.model.param_to_grad[param], [], to_file=1)
    def print_weights(self):
        """Debug helper: print embedding tables, per-sample weights, and
        MLP weights (device 0 only in multi-device mode).
        """
        for _, l in enumerate(self.emb_w):
            # print(l)
            print(self.FetchBlobWrapper(l, False))
        if self.weighted_pooling == "learned":
            for _, l in enumerate(self.emb_vw):
                # print(l)
                print(self.FetchBlobWrapper(l, False))
        for _, l in enumerate(self.bot_w):
            # print(l)
            if self.ndevices > 1:
                print(self.FetchBlobWrapper(l, False, device_id=0))
            else:
                print(self.FetchBlobWrapper(l))
        for _, l in enumerate(self.top_w):
            # print(l)
            if self.ndevices > 1:
                print(self.FetchBlobWrapper(l, False, device_id=0))
            else:
                print(self.FetchBlobWrapper(l))
    def print_activations(self):
        """Debug helper: print embedding, bottom-MLP, interaction, and
        top-MLP activation blobs.
        """
        for _, l in enumerate(self.emb_l):
            print(l)
            print(self.FetchBlobWrapper(l, False))
        for _, l in enumerate(self.bot_l):
            print(l)
            print(self.FetchBlobWrapper(l))
        print(self.tint)
        print(self.FetchBlobWrapper(self.tint))
        for _, l in enumerate(self.top_l):
            print(l)
            print(self.FetchBlobWrapper(l))
def define_metrics():
    """Return a dict of metric name -> callable(y_true, y_score).

    Thresholded metrics (recall/precision/f1/accuracy) binarize the scores
    with np.round; "ap" and "roc_auc" consume the raw scores.
    """
    metrics = {
        "loss": lambda y_true, y_score: sklearn.metrics.log_loss(
            y_true=y_true, y_pred=y_score, labels=[0, 1]
        ),
        "recall": lambda y_true, y_score: sklearn.metrics.recall_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "precision": lambda y_true, y_score: sklearn.metrics.precision_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "f1": lambda y_true, y_score: sklearn.metrics.f1_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        "ap": sklearn.metrics.average_precision_score,
        "roc_auc": sklearn.metrics.roc_auc_score,
        "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score(
            y_true=y_true, y_pred=np.round(y_score)
        ),
        # 'pre_curve' : sklearn.metrics.precision_recall_curve,
        # 'roc_curve' : sklearn.metrics.roc_curve,
    }
    return metrics
def calculate_metrics(targets, scores):
    """Concatenate per-batch targets/scores and evaluate every metric from
    define_metrics() on them.

    A metric that raises is reported as -1 (best-effort) and the error is
    printed, so one failing metric never aborts validation.
    """
    scores = np.concatenate(scores, axis=0)
    targets = np.concatenate(targets, axis=0)

    validation_results = {}
    for metric_name, metric_function in define_metrics().items():
        try:
            validation_results[metric_name] = metric_function(targets, scores)
        except Exception as error:
            validation_results[metric_name] = -1
            print("{} in calculating {}".format(error, metric_name))
    return validation_results
if __name__ == "__main__":
    import argparse
    ### import packages ###
    import sys
    ### parse arguments ###
    # Command-line interface for the Caffe2 DLRM trainer.
    parser = argparse.ArgumentParser(
        description="Train Deep Learning Recommendation Model (DLRM)"
    )
    # model related parameters
    parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
    parser.add_argument("--arch-embedding-size", type=str, default="4-3-2")
    parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2")
    parser.add_argument("--arch-mlp-top", type=str, default="4-2-1")
    parser.add_argument("--arch-interaction-op", type=str, default="dot")
    parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
    # activations and loss
    parser.add_argument("--activation-function", type=str, default="relu")
    parser.add_argument("--loss-function", type=str, default="mse") # or bce
    parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7
    # NOTE(review): argparse type=bool treats any non-empty string as True
    # (e.g. "--round-targets False" yields True); the other type=bool flags
    # below share this pitfall.
    parser.add_argument("--round-targets", type=bool, default=False)
    parser.add_argument("--weighted-pooling", type=str, default=None)
    # data
    parser.add_argument("--data-size", type=int, default=1)
    parser.add_argument("--num-batches", type=int, default=0)
    parser.add_argument(
        "--data-generation", type=str, default="random"
    ) # or synthetic or dataset
    parser.add_argument(
        "--rand-data-dist", type=str, default="uniform"
    ) # uniform or gaussian
    parser.add_argument("--rand-data-min", type=float, default=0)
    parser.add_argument("--rand-data-max", type=float, default=1)
    parser.add_argument("--rand-data-mu", type=float, default=-1)
    parser.add_argument("--rand-data-sigma", type=float, default=1)
    parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
    parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
    parser.add_argument("--raw-data-file", type=str, default="")
    parser.add_argument("--processed-data-file", type=str, default="")
    parser.add_argument("--data-randomize", type=str, default="total") # or day or none
    parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
    parser.add_argument("--max-ind-range", type=int, default=-1)
    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
    parser.add_argument("--num-indices-per-lookup", type=int, default=10)
    parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
    parser.add_argument("--num-workers", type=int, default=0)
    parser.add_argument("--memory-map", action="store_true", default=False)
    # training
    parser.add_argument("--mini-batch-size", type=int, default=1)
    parser.add_argument("--nepochs", type=int, default=1)
    parser.add_argument("--learning-rate", type=float, default=0.01)
    parser.add_argument("--print-precision", type=int, default=5)
    parser.add_argument("--numpy-rand-seed", type=int, default=123)
    parser.add_argument("--sync-dense-params", type=bool, default=True)
    parser.add_argument("--caffe2-net-type", type=str, default="")
    parser.add_argument(
        "--optimizer",
        type=str,
        default="sgd",
        help="""This is the optimizer for embedding tables.""",
    )
    parser.add_argument(
        "--dataset-multiprocessing",
        action="store_true",
        default=False,
        help="The Kaggle dataset can be multiprocessed in an environment \
            with more than 7 CPU cores and more than 20 GB of memory. \n \
            The Terabyte dataset can be multiprocessed in an environment \
            with more than 24 CPU cores and at least 1 TB of memory.",
    )
    # inference
    parser.add_argument("--inference-only", action="store_true", default=False)
    # onnx (or protobuf with shapes)
    parser.add_argument("--save-onnx", action="store_true", default=False)
    parser.add_argument("--save-proto-types-shapes", action="store_true", default=False)
    # gpu
    parser.add_argument("--use-gpu", action="store_true", default=False)
    # debugging and profiling
    parser.add_argument("--print-freq", type=int, default=1)
    parser.add_argument("--test-freq", type=int, default=-1)
    parser.add_argument("--test-mini-batch-size", type=int, default=-1)
    parser.add_argument("--test-num-workers", type=int, default=-1)
    parser.add_argument("--print-time", action="store_true", default=False)
    parser.add_argument("--debug-mode", action="store_true", default=False)
    parser.add_argument("--enable-profiling", action="store_true", default=False)
    parser.add_argument("--plot-compute-graph", action="store_true", default=False)
    # mlperf logging (disables other output and stops early)
    parser.add_argument("--mlperf-logging", action="store_true", default=False)
    # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
    parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
    # stop at target AUC Terabyte (no subsampling) 0.8025
    parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
    args = parser.parse_args()
    # Multiprocessed dataset preprocessing is known to break on <= 3.7.
    if args.dataset_multiprocessing:
        assert sys.version_info[0] >= 3 and sys.version_info[1] > 7, (
            "The dataset_multiprocessing "
            + "flag is susceptible to a bug in Python 3.7 and under. "
            + "https://github.com/facebookresearch/dlrm/issues/172"
        )
    ### some basic setup ###
    # WARNING: to obtain exactly the same initialization for
    # the weights we need to start from the same random seed.
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)
    if args.test_mini_batch_size < 0:
        # if the parameter is not set, use the training batch size
        args.test_mini_batch_size = args.mini_batch_size
    if args.test_num_workers < 0:
        # if the parameter is not set, use the same parameter for training
        args.test_num_workers = args.num_workers
    # Pick the Caffe2 device; device_opt scopes all subsequent net building.
    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
        ngpus = workspace.NumGpuDevices() # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")
    ### prepare training data ###
    # NOTE(review): np.fromstring is deprecated for text parsing in modern
    # numpy; np.array([int(s) for s in x.split("-")]) is the replacement.
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    if args.data_generation == "dataset":
        # Criteo dataset path: sizes come from the data itself.
        if args.num_workers > 0 or args.test_num_workers > 0:
            print(
                "WARNING: non default --num-workers or --test-num-workers options"
                + " are not supported and will be ignored"
            )
        if args.mini_batch_size != args.test_mini_batch_size:
            print(
                "WARNING: non default ----test-mini-batch-size option"
                + " is not supported and will be ignored"
            )
        # input and target from dataset
        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(
            args,
            offset_to_length_converter=True,
        )
        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
        nbatches_test = len(test_ld)
        ln_emb = train_data.counts
        m_den = train_data.m_den
        # enforce maximum limit on number of vectors per embedding
        if args.max_ind_range > 0:
            ln_emb = np.array(
                list(
                    map(
                        lambda x: x if x < args.max_ind_range else args.max_ind_range,
                        ln_emb,
                    )
                )
            )
        # bottom mlp input width must equal the dense feature count
        ln_bot[0] = m_den
    else:
        # Synthetic/random data path: sizes come from the CLI arguments.
        if args.num_workers > 0 or args.test_num_workers > 0:
            print(
                "WARNING: non default --num-workers or --test-num-workers options"
                + " are not supported and will be ignored"
            )
        if args.mini_batch_size != args.test_mini_batch_size:
            print(
                "WARNING: non default ----test-mini-batch-size option"
                + " is not supported and will be ignored"
            )
        # input and target at random
        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
        m_den = ln_bot[0]
        train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(
            args,
            ln_emb,
            m_den,
            offset_to_length_converter=True,
        )
        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
        nbatches_test = len(test_ld)
    # table_feature_map = {idx : idx for idx in range(len(ln_emb))}
    ### parse command line arguments ###
    m_spa = args.arch_sparse_feature_size
    ln_emb = np.asarray(ln_emb)
    num_fea = ln_emb.size + 1 # num sparse + num dense features
    m_den_out = ln_bot[ln_bot.size - 1]
    # Derive the top-mlp input width from the chosen interaction op.
    if args.arch_interaction_op == "dot":
        # approach 1: all
        # num_int = num_fea * num_fea + m_den_out
        # approach 2: unique
        # lower triangle of the num_fea x num_fea dot-product matrix,
        # with or without the diagonal (self-interaction)
        if args.arch_interaction_itself:
            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
        else:
            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
    elif args.arch_interaction_op == "cat":
        num_int = num_fea * m_den_out
    else:
        sys.exit(
            "ERROR: --arch-interaction-op="
            + args.arch_interaction_op
            + " is not supported"
        )
    # prepend the interaction width to the user-specified top-mlp sizes
    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
    # sanity check: feature sizes and mlp dimensions must match
    if m_den != ln_bot[0]:
        sys.exit(
            "ERROR: arch-dense-feature-size "
            + str(m_den)
            + " does not match first dim of bottom mlp "
            + str(ln_bot[0])
        )
    if m_spa != m_den_out:
        sys.exit(
            "ERROR: arch-sparse-feature-size "
            + str(m_spa)
            + " does not match last dim of bottom mlp "
            + str(m_den_out)
        )
    if num_int != ln_top[0]:
        sys.exit(
            "ERROR: # of feature interactions "
            + str(num_int)
            + " does not match first dim of top mlp "
            + str(ln_top[0])
        )
    # test prints (model arch)
    if args.debug_mode:
        print("model arch:")
        print(
            "mlp top arch "
            + str(ln_top.size - 1)
            + " layers, with input to output dimensions:"
        )
        print(ln_top)
        print("# of interactions")
        print(num_int)
        print(
            "mlp bot arch "
            + str(ln_bot.size - 1)
            + " layers, with input to output dimensions:"
        )
        print(ln_bot)
        print("# of features (sparse and dense)")
        print(num_fea)
        print("dense feature size")
        print(m_den)
        print("sparse feature size")
        print(m_spa)
        print(
            "# of embeddings (= # of sparse features) "
            + str(ln_emb.size)
            + ", with dimensions "
            + str(m_spa)
            + "x:"
        )
        print(ln_emb)
        print("data (inputs and targets):")
        for j, inputBatch in enumerate(train_ld):
            lX_j, lS_l_j, lS_i_j, lT_j = inputBatch
            print("mini-batch: %d" % j)
            print(lX_j)
            print(lS_l_j)
            print(lS_i_j)
            print(lT_j)
    ### construct the neural network specified above ###
    # WARNING: to obtain exactly the same initialization for
    # the weights we need to start from the same random seed.
    # np.random.seed(args.numpy_rand_seed)
    # ndevices > 1 triggers the model-parallel path inside DLRM_Net
    ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
    flag_types_shapes = args.save_onnx or args.save_proto_types_shapes
    flag_forward_ops = not (use_gpu and ndevices > 1)
    with core.DeviceScope(device_opt):
        dlrm = DLRM_Net(
            m_spa,
            ln_emb,
            ln_bot,
            ln_top,
            args.arch_interaction_op,
            arch_interaction_itself=args.arch_interaction_itself,
            sigmoid_bot=-1,
            sigmoid_top=ln_top.size - 1,
            save_onnx=flag_types_shapes,
            ndevices=ndevices,
            # forward_ops = flag_forward_ops
            enable_prof=args.enable_profiling,
            weighted_pooling=args.weighted_pooling,
            emb_optimizer=args.optimizer,
        )
    # load nccl if using multiple devices
    if args.sync_dense_params and ndevices > 1:
        dyndep.InitOpsLibrary("//caffe2/caffe2/contrib/nccl:nccl_ops")
    # set the net type for better performance (dag, async_scheduling, etc)
    if args.caffe2_net_type:
        dlrm.parameters().net.Proto().type = args.caffe2_net_type
    # plot compute graph
    if args.plot_compute_graph:
        graph = net_drawer.GetPydotGraph(
            dlrm.parameters().net, "dlrm_s_caffe2_graph", "BT"
        )
        graph.write_pdf(graph.get_name() + ".pdf")
    # test prints
    if args.debug_mode:
        print("initial parameters (weights and bias):")
        dlrm.print_weights()
    # add training loss if needed
    if not args.inference_only:
        with core.DeviceScope(device_opt):
            # specify the loss function
            # per-device scaling so losses sum to the single-device value
            nd = 1.0 if dlrm.ndevices <= 1 else 1.0 / dlrm.ndevices # 1
            if args.loss_function == "mse":
                dlrm.MSEloss(scale=nd)
            elif args.loss_function == "bce":
                dlrm.BCEloss(scale=nd, threshold=args.loss_threshold)
            else:
                sys.exit(
                    "ERROR: --loss-function=" + args.loss_function + " is not supported"
                )
        # define test net (as train net without gradients)
        dlrm.test_net = core.Net(copy.deepcopy(dlrm.model.net.Proto()))
        # specify the optimizer algorithm
        if args.optimizer == "sgd":
            dlrm.sgd_optimizer(
                args.learning_rate, sync_dense_params=args.sync_dense_params
            )
        elif args.optimizer in ["adagrad", "rwsadagrad"]:
            dlrm.adagrad_optimizer(
                args.learning_rate, sync_dense_params=args.sync_dense_params
            )
        else:
            sys.exit(
                """ERROR: Select an optimizer for
                embedding tables : 'sgd', 'adagrad',
                or 'rwsadagrad' """
            )
    # init/create
    # peek one batch to materialize blob shapes for net creation
    X, lS_l, lS_i, T = next(
        iter(train_ld)
    ) # does not affect the enumerate(train_ld) in the main loop
    dlrm.create(X, lS_l, lS_i, T.int())
### main loop ###
best_gA_test = 0
best_auc_test = 0
total_time = 0
total_loss = 0
total_accu = 0
total_iter = 0
total_samp = 0
k = 0
print("time/loss/accuracy (if enabled):")
while k < args.nepochs:
j = 0
for j, inputBatch in enumerate(train_ld):
# forward and backward pass, where the latter runs only
# when gradients and loss have been added to the net
time1 = time.time()
lX_j, lS_l_j, lS_i_j, lT_j = inputBatch
lT_j = lT_j.int() if args.loss_function == "bce" else lT_j
dlrm.run(lX_j, lS_l_j, lS_i_j, lT_j)
time2 = time.time()
total_time += time2 - time1
# compte loss and accuracy
Z = dlrm.get_output() # numpy array
T = lT_j.numpy()
"""
# debug prints
print("output and loss")
print(Z)
print(dlrm.get_loss())
"""
mbs = T.shape[0] # = args.mini_batch_size except maybe for last
A = np.sum((np.round(Z, 0) == T).astype(np.uint8))
total_accu += 0 if args.inference_only else A
total_loss += 0 if args.inference_only else dlrm.get_loss() * mbs
total_iter += 1
total_samp += mbs
# print time, loss and accuracy
should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches)
should_test = (
(args.test_freq > 0)
and (args.data_generation in ["dataset", "random"])
and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
)
if should_print or should_test:
gT = 1000.0 * total_time / total_iter if args.print_time else -1
total_time = 0
gA = total_accu / total_samp
total_accu = 0
gL = total_loss / total_samp
total_loss = 0
str_run_type = "inference" if args.inference_only else "training"
print(
"Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format(
str_run_type, j + 1, nbatches, k, gT
)
+ " loss {:.6f}".format(gL)
)
total_iter = 0
total_samp = 0
# debug prints
# print(Z)
# print(T)
# testing
if should_test and not args.inference_only:
# don't measure training iter time in a test iteration
if args.mlperf_logging:
previous_iteration_time = None
test_accu = 0
test_loss = 0
test_samp = 0
if args.mlperf_logging:
scores = []
targets = []
for i, testBatch in enumerate(test_ld):
# early exit if nbatches was set by the user and was exceeded
if nbatches > 0 and i >= nbatches:
break
# forward pass
lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i = testBatch
lT_test_i = (
lT_test_i.int()
if args.loss_function == "bce"
else lT_test_i
)
dlrm.run(
lX_test_i,
lS_l_test_i,
lS_i_test_i,
lT_test_i,
test_net=True,
)
Z_test = dlrm.get_output()
T_test = lT_test_i.numpy()
if args.mlperf_logging:
scores.append(Z_test)
targets.append(T_test)
else:
# compte loss and accuracy
L_test = dlrm.get_loss()
mbs_test = T_test.shape[0] # = mini_batch_size except last
A_test = np.sum(
(np.round(Z_test, 0) == T_test).astype(np.uint8)
)
test_accu += A_test
test_loss += L_test * mbs_test
test_samp += mbs_test
# compute metrics (after test loop has finished)
if args.mlperf_logging:
validation_results = calculate_metrics(targets, scores)
gA_test = validation_results["accuracy"]
gL_test = validation_results["loss"]
else:
gA_test = test_accu / test_samp
gL_test = test_loss / test_samp
# print metrics
is_best = gA_test > best_gA_test
if is_best:
best_gA_test = gA_test
if args.mlperf_logging:
is_best = validation_results["roc_auc"] > best_auc_test
if is_best:
best_auc_test = validation_results["roc_auc"]
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
+ " loss {:.6f}, recall {:.4f}, precision {:.4f},".format(
validation_results["loss"],
validation_results["recall"],
validation_results["precision"],
)
+ " f1 {:.4f}, ap {:.4f},".format(
validation_results["f1"],
validation_results["ap"],
)
+ " auc {:.4f}, best auc {:.4f},".format(
validation_results["roc_auc"], best_auc_test
)
+ " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
validation_results["accuracy"] * 100, best_gA_test * 100
)
)
else:
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0)
+ " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format(
gL_test, gA_test * 100, best_gA_test * 100
)
)
# check thresholds
if (
args.mlperf_logging
and (args.mlperf_acc_threshold > 0)
and (best_gA_test > args.mlperf_acc_threshold)
):
print(
"MLPerf testing accuracy threshold "
+ str(args.mlperf_acc_threshold)
+ " reached, stop training"
)
break
if (
args.mlperf_logging
and (args.mlperf_auc_threshold > 0)
and (best_auc_test > args.mlperf_auc_threshold)
):
print(
"MLPerf testing auc threshold "
+ str(args.mlperf_auc_threshold)
+ " reached, stop training"
)
break
j += 1 # nbatches
k += 1 # nepochs
    # test prints
    if not args.inference_only and args.debug_mode:
        print("updated parameters (weights and bias):")
        dlrm.print_weights()
    # build onnx model from caffe2
    if args.save_onnx:
        pnet = dlrm.parameters().net.Proto()
        inet = dlrm.parameters().param_init_net.Proto()
        value_info = dlrm.onnx_tsd # None
        # debug prints
        # print(value_info)
        # WARNING: Why Caffe2 to ONNX net transformation currently does not work?
        # 1. ONNX does not support SparseLengthsSum operator directly. A workaround
        # could be for the Caffe2 ONNX frontend to indirectly map this operator to
        # Gather and ReducedSum ONNX operators, following the PyTorch approach.
        c2f = caffe2.python.onnx.frontend.Caffe2Frontend()
        dlrm_caffe2_onnx = c2f.caffe2_net_to_onnx_model(pnet, inet, value_info)
        # check the onnx model
        onnx.checker.check_model(dlrm_caffe2_onnx)
        # save model to a file
        with open("dlrm_s_caffe2.onnx", "w+") as dlrm_caffe2_onnx_file:
            dlrm_caffe2_onnx_file.write(str(dlrm_caffe2_onnx))
    # build protobuf with types and shapes
    if args.save_proto_types_shapes:
        # add types and shapes to protobuf
        # maps onnx tensor dtypes to the equivalent caffe2 dtypes
        __TYPE_MAPPING = {
            onnx.TensorProto.FLOAT: caffe2_pb2.TensorProto.FLOAT,
            onnx.TensorProto.UINT8: caffe2_pb2.TensorProto.UINT8,
            onnx.TensorProto.INT8: caffe2_pb2.TensorProto.INT8,
            onnx.TensorProto.UINT16: caffe2_pb2.TensorProto.UINT16,
            onnx.TensorProto.INT16: caffe2_pb2.TensorProto.INT16,
            onnx.TensorProto.INT32: caffe2_pb2.TensorProto.INT32,
            onnx.TensorProto.INT64: caffe2_pb2.TensorProto.INT64,
            onnx.TensorProto.STRING: caffe2_pb2.TensorProto.STRING,
            onnx.TensorProto.BOOL: caffe2_pb2.TensorProto.BOOL,
            onnx.TensorProto.FLOAT16: caffe2_pb2.TensorProto.FLOAT16,
            onnx.TensorProto.DOUBLE: caffe2_pb2.TensorProto.DOUBLE,
        }
        pnet = dlrm.parameters().net.Proto()
        arg = pnet.arg.add()
        arg.name = "input_shape_info"
        # attach type/shape info for every external input we know about
        for i in pnet.external_input:
            if i in dlrm.onnx_tsd:
                onnx_dtype, shape = dlrm.onnx_tsd[i]
                t = arg.tensors.add()
                t.name = i
                t.data_type = __TYPE_MAPPING[onnx_dtype]
                t.dims.extend(shape)
            else:
                print("Warning: we don't have shape/type info for input: {}".format(i))
        # debug print
        # print(pnet)
        # export the protobuf with types and shapes
        with open("dlrm_s_caffe2.proto", "w+") as dlrm_s_proto_file:
            dlrm_s_proto_file.write(str(pnet))
    """
    # export the protobuf with types and shapes as well as weights
    # see https://github.com/pytorch/pytorch/issues/9533
    #save
    net = dlrm.parameters().net
    params = dlrm.parameters().params
    init_net, predict_net = mobile_exporter.Export(workspace, net, params)
    with open("dlrm_s_caffe2.predict", "wb") as dlrm_s_predict_file:
        dlrm_s_predict_file.write(predict_net.SerializeToString())
    with open("dlrm_s_caffe2.init", "wb") as dlrm_s_init_file:
        dlrm_s_init_file.write(init_net.SerializeToString())
    #load
    net_def = caffe2_pb2.NetDef()
    init_def= caffe2_pb2.NetDef()
    with open("dlrm_s_caffe2.predict", "rb") as dlrm_s_predict_file:
        net_def.ParseFromString(dlrm_s_predict_file.read())
        print(net_def)
    with open("dlrm_s_caffe2.init", "rb") as dlrm_s_init_file:
        init_def.ParseFromString(dlrm_s_init_file.read())
        print(init_def)
    """
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
# miscellaneous
import builtins
import datetime
import json
import sys
import time
# onnx
# The onnx import causes deprecation warnings every time workers
# are spawned during testing. So, we filter out those warnings.
import warnings
# data generation
import dlrm_data_pytorch as dp
# For distributed run
import extend_distributed as ext_dist
import mlperf_logger
# numpy
import numpy as np
import optim.rwsadagrad as RowWiseSparseAdagrad
import sklearn.metrics
# pytorch
import torch
import torch.nn as nn
# dataloader
try:
from internals import fbDataLoader, fbInputBatchFormatter
has_internal_libs = True
except ImportError:
has_internal_libs = False
from torch._ops import ops
from torch.autograd.profiler import record_function
from torch.nn.parallel.parallel_apply import parallel_apply
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.scatter_gather import gather, scatter
from torch.nn.parameter import Parameter
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.tensorboard import SummaryWriter
# mixed-dimension trick
from tricks.md_embedding_bag import md_solver, PrEmbeddingBag
# quotient-remainder trick
from tricks.qr_embedding_bag import QREmbeddingBag
# onnx is optional: import it while silencing its DeprecationWarnings
# (they would otherwise fire every time a dataloader worker spawns).
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        import onnx
    except ImportError as error:
        print("Unable to import onnx. ", error)
# from torchviz import make_dot
# import torch.nn.functional as Functional
# from torch.nn.parameter import Parameter
# NOTE(review): if builtins has no IOError this falls back to the STRING
# "FileNotFoundError", not the exception class — looks unintended; verify
# how `exc` is used before relying on it in an except clause.
exc = getattr(builtins, "IOError", "FileNotFoundError")
def time_wrap(use_gpu):
    """Wall-clock timestamp; on GPU, wait for pending CUDA work first."""
    if use_gpu:
        # flush queued kernels so the timestamp reflects finished work
        torch.cuda.synchronize()
    return time.time()
def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1):
    """Run one DLRM forward pass, moving inputs to `device` when on GPU.

    lS_i / lS_o may each be a list of per-table tensors or a single
    stacked tensor; both forms are transferred as needed.
    """

    def _to_device(t):
        # handle list-of-tensors and stacked-tensor forms uniformly
        if isinstance(t, list):
            return [part.to(device) for part in t]
        return t.to(device)

    with record_function("DLRM forward"):
        # only the single-device GPU path transfers sparse inputs here;
        # the multi-device path scatters them inside the model
        if use_gpu and ndevices == 1:
            lS_i = _to_device(lS_i)
            lS_o = _to_device(lS_o)
        return dlrm(X.to(device), lS_o, lS_i)
def loss_fn_wrap(Z, T, use_gpu, device):
    """Compute the configured training loss for predictions Z vs targets T."""
    with record_function("DLRM loss compute"):
        if args.loss_function in ("mse", "bce"):
            return dlrm.loss_fn(Z, T.to(device))
        if args.loss_function == "wbce":
            # weighted BCE: per-sample weight chosen by the 0/1 target class
            sample_weights = (
                dlrm.loss_ws[T.data.view(-1).long()].view_as(T).to(device)
            )
            raw_loss = dlrm.loss_fn(Z, T.to(device))
            return (sample_weights * raw_loss).mean()
# The following function is a wrapper to avoid checking this multiple times in the
# loop below.
def unpack_batch(b):
    """Normalize a batch to (X, lS_o, lS_i, T, W, CBPP) regardless of source."""
    if args.data_generation == "internal":
        return fbInputBatchFormatter(b, args.data_size)
    # Experiment with unweighted samples
    X, lS_o, lS_i, T = b[0], b[1], b[2], b[3]
    return X, lS_o, lS_i, T, torch.ones(T.size()), None
class LRPolicyScheduler(_LRScheduler):
    """Linear-warmup / quadratic-decay learning-rate policy.

    The lr ramps linearly from 0 over `num_warmup_steps`, holds between
    warmup and `decay_start_step`, then decays quadratically to a tiny
    floor over `num_decay_steps`; afterwards it stays frozen at the last
    decayed value.
    """

    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps
        # the two phases must not overlap
        if self.decay_start_step < self.num_warmup_steps:
            sys.exit("Learning rate warmup must finish before the decay starts")
        super(LRPolicyScheduler, self).__init__(optimizer)

    def get_lr(self):
        step = self._step_count
        if step < self.num_warmup_steps:
            # linear warmup from 0 toward the base lr
            frac = 1.0 - (self.num_warmup_steps - step) / self.num_warmup_steps
            lr = [base_lr * frac for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step < self.decay_end_step:
            # quadratic decay toward a small floor
            decayed_steps = step - self.decay_start_step
            frac = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * frac) for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.num_decay_steps > 0:
            # freeze at last, either because we're after decay
            # or because we're between warmup and decay
            lr = self.last_lr
        else:
            # no decay configured: do not adjust
            lr = self.base_lrs
        return lr
### define dlrm in PyTorch ###
class DLRM_Net(nn.Module):
def create_mlp(self, ln, sigmoid_layer):
# build MLP layer by layer
layers = nn.ModuleList()
for i in range(0, ln.size - 1):
n = ln[i]
m = ln[i + 1]
# construct fully connected operator
LL = nn.Linear(int(n), int(m), bias=True)
# initialize the weights
# with torch.no_grad():
# custom Xavier input, output or two-sided fill
mean = 0.0 # std_dev = np.sqrt(variance)
std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n)
W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1))
bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
# approach 1
LL.weight.data = torch.tensor(W, requires_grad=True)
LL.bias.data = torch.tensor(bt, requires_grad=True)
# approach 2
# LL.weight.data.copy_(torch.tensor(W))
# LL.bias.data.copy_(torch.tensor(bt))
# approach 3
# LL.weight = Parameter(torch.tensor(W),requires_grad=True)
# LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
layers.append(LL)
# construct sigmoid or relu operator
if i == sigmoid_layer:
layers.append(nn.Sigmoid())
else:
layers.append(nn.ReLU())
# approach 1: use ModuleList
# return layers
# approach 2: use Sequential container to wrap all layers
return torch.nn.Sequential(*layers)
    def create_emb(self, m, ln, weighted_pooling=None):
        """Create one embedding table per sparse feature.

        m: embedding dimension (a list of per-table dims when md_flag is
        set — TODO confirm against caller); ln: vector of table sizes.
        Returns (emb_l, v_W_l): the tables and per-table pooling weights
        (None entries when weighted_pooling is None).
        """
        emb_l = nn.ModuleList()
        v_W_l = []
        for i in range(0, ln.size):
            # distributed run: only build the tables assigned to this rank
            if ext_dist.my_size > 1:
                if i not in self.local_emb_indices:
                    continue
            n = ln[i]
            # construct embedding operator
            if self.qr_flag and n > self.qr_threshold:
                # quotient-remainder trick for very large tables
                EE = QREmbeddingBag(
                    n,
                    m,
                    self.qr_collisions,
                    operation=self.qr_operation,
                    mode="sum",
                    sparse=True,
                )
            elif self.md_flag and n > self.md_threshold:
                # mixed-dimension trick: per-table dim m[i], projected to base
                base = max(m)
                _m = m[i] if n > self.md_threshold else base
                EE = PrEmbeddingBag(n, _m, base)
                # use np initialization as below for consistency...
                W = np.random.uniform(
                    low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m)
                ).astype(np.float32)
                EE.embs.weight.data = torch.tensor(W, requires_grad=True)
            else:
                EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True)
                # initialize embeddings
                # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
                W = np.random.uniform(
                    low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
                ).astype(np.float32)
                # approach 1
                EE.weight.data = torch.tensor(W, requires_grad=True)
                # approach 2
                # EE.weight.data.copy_(torch.tensor(W))
                # approach 3
                # EE.weight = Parameter(torch.tensor(W),requires_grad=True)
            # pooling weights: one per row, ones-initialized when enabled
            if weighted_pooling is None:
                v_W_l.append(None)
            else:
                v_W_l.append(torch.ones(n, dtype=torch.float32))
            emb_l.append(EE)
        return emb_l, v_W_l
    def __init__(
        self,
        m_spa=None,
        ln_emb=None,
        ln_bot=None,
        ln_top=None,
        arch_interaction_op=None,
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=-1,
        sync_dense_params=True,
        loss_threshold=0.0,
        ndevices=-1,
        qr_flag=False,
        qr_operation="mult",
        qr_collisions=0,
        qr_threshold=200,
        md_flag=False,
        md_threshold=200,
        weighted_pooling=None,
        loss_function="bce",
    ):
        """Build the DLRM: embedding tables + bottom/top MLPs + loss.

        m_spa: sparse-feature embedding dim; ln_emb: table sizes;
        ln_bot/ln_top: MLP layer sizes; arch_interaction_op: "dot"/"cat";
        ndevices <= 1 builds all operators here, otherwise embeddings are
        created later by the parallel path.  If any of the five core arch
        arguments is None the model is left unconstructed.
        """
        super(DLRM_Net, self).__init__()
        if (
            (m_spa is not None)
            and (ln_emb is not None)
            and (ln_bot is not None)
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
            # save arguments
            self.ndevices = ndevices
            self.output_d = 0
            self.parallel_model_batch_size = -1
            self.parallel_model_is_not_prepared = True
            self.arch_interaction_op = arch_interaction_op
            self.arch_interaction_itself = arch_interaction_itself
            self.sync_dense_params = sync_dense_params
            self.loss_threshold = loss_threshold
            self.loss_function = loss_function
            # anything other than None/"fixed" means learnable pooling weights
            if weighted_pooling is not None and weighted_pooling != "fixed":
                self.weighted_pooling = "learned"
            else:
                self.weighted_pooling = weighted_pooling
            # create variables for QR embedding if applicable
            self.qr_flag = qr_flag
            if self.qr_flag:
                self.qr_collisions = qr_collisions
                self.qr_operation = qr_operation
                self.qr_threshold = qr_threshold
            # create variables for MD embedding if applicable
            self.md_flag = md_flag
            if self.md_flag:
                self.md_threshold = md_threshold
            # If running distributed, get local slice of embedding tables
            if ext_dist.my_size > 1:
                n_emb = len(ln_emb)
                # need at least one table per rank to partition
                if n_emb < ext_dist.my_size:
                    sys.exit(
                        "only (%d) sparse features for (%d) devices, table partitions will fail"
                        % (n_emb, ext_dist.my_size)
                    )
                self.n_global_emb = n_emb
                self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(
                    n_emb
                )
                self.local_emb_slice = ext_dist.get_my_slice(n_emb)
                self.local_emb_indices = list(range(n_emb))[self.local_emb_slice]
            # create operators
            if ndevices <= 1:
                self.emb_l, w_list = self.create_emb(m_spa, ln_emb, weighted_pooling)
                if self.weighted_pooling == "learned":
                    # register pooling weights as trainable parameters
                    self.v_W_l = nn.ParameterList()
                    for w in w_list:
                        self.v_W_l.append(Parameter(w))
                else:
                    self.v_W_l = w_list
            self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
            self.top_l = self.create_mlp(ln_top, sigmoid_top)
            # quantization
            # disabled by default; see quantize_embedding()
            self.quantize_emb = False
            self.emb_l_q = []
            self.quantize_bits = 32
            # specify the loss function
            if self.loss_function == "mse":
                self.loss_fn = torch.nn.MSELoss(reduction="mean")
            elif self.loss_function == "bce":
                self.loss_fn = torch.nn.BCELoss(reduction="mean")
            elif self.loss_function == "wbce":
                # per-class weights parsed from the global CLI args
                self.loss_ws = torch.tensor(
                    np.fromstring(args.loss_weights, dtype=float, sep="-")
                )
                self.loss_fn = torch.nn.BCELoss(reduction="none")
            else:
                sys.exit(
                    "ERROR: --loss-function=" + self.loss_function + " is not supported"
                )
def apply_mlp(self, x, layers):
# approach 1: use ModuleList
# for layer in layers:
# x = layer(x)
# return x
# approach 2: use Sequential container to wrap all layers
return layers(x)
def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
# WARNING: notice that we are processing the batch at once. We implicitly
# assume that the data is laid out such that:
# 1. each embedding is indexed with a group of sparse indices,
# corresponding to a single lookup
# 2. for each embedding the lookups are further organized into a batch
# 3. for a list of embedding tables there is a list of batched lookups
ly = []
for k, sparse_index_group_batch in enumerate(lS_i):
sparse_offset_group_batch = lS_o[k]
# embedding lookup
# We are using EmbeddingBag, which implicitly uses sum operator.
# The embeddings are represented as tall matrices, with sum
# happening vertically across 0 axis, resulting in a row vector
# E = emb_l[k]
if v_W_l[k] is not None:
per_sample_weights = v_W_l[k].gather(0, sparse_index_group_batch)
else:
per_sample_weights = None
if self.quantize_emb:
s1 = self.emb_l_q[k].element_size() * self.emb_l_q[k].nelement()
s2 = self.emb_l_q[k].element_size() * self.emb_l_q[k].nelement()
print("quantized emb sizes:", s1, s2)
if self.quantize_bits == 4:
QV = ops.quantized.embedding_bag_4bit_rowwise_offsets(
self.emb_l_q[k],
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
elif self.quantize_bits == 8:
QV = ops.quantized.embedding_bag_byte_rowwise_offsets(
self.emb_l_q[k],
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(QV)
else:
E = emb_l[k]
V = E(
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(V)
# print(ly)
return ly
# using quantizing functions from caffe2/aten/src/ATen/native/quantized/cpu
def quantize_embedding(self, bits):
n = len(self.emb_l)
self.emb_l_q = [None] * n
for k in range(n):
if bits == 4:
self.emb_l_q[k] = ops.quantized.embedding_bag_4bit_prepack(
self.emb_l[k].weight
)
elif bits == 8:
self.emb_l_q[k] = ops.quantized.embedding_bag_byte_prepack(
self.emb_l[k].weight
)
else:
return
self.emb_l = None
self.quantize_emb = True
self.quantize_bits = bits
def interact_features(self, x, ly):
if self.arch_interaction_op == "dot":
# concatenate dense and sparse features
(batch_size, d) = x.shape
T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
# perform a dot product
Z = torch.bmm(T, torch.transpose(T, 1, 2))
# append dense feature with the interactions (into a row vector)
# approach 1: all
# Zflat = Z.view((batch_size, -1))
# approach 2: unique
_, ni, nj = Z.shape
# approach 1: tril_indices
# offset = 0 if self.arch_interaction_itself else -1
# li, lj = torch.tril_indices(ni, nj, offset=offset)
# approach 2: custom
offset = 1 if self.arch_interaction_itself else 0
li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])
Zflat = Z[:, li, lj]
# concatenate dense features and interactions
R = torch.cat([x] + [Zflat], dim=1)
elif self.arch_interaction_op == "cat":
# concatenation features (into a row vector)
R = torch.cat([x] + ly, dim=1)
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ self.arch_interaction_op
+ " is not supported"
)
return R
def forward(self, dense_x, lS_o, lS_i):
if ext_dist.my_size > 1:
# multi-node multi-device run
return self.distributed_forward(dense_x, lS_o, lS_i)
elif self.ndevices <= 1:
# single device run
return self.sequential_forward(dense_x, lS_o, lS_i)
else:
# single-node multi-device run
return self.parallel_forward(dense_x, lS_o, lS_i)
    def distributed_forward(self, dense_x, lS_o, lS_i):
        """Forward pass for the multi-node (hybrid parallel) configuration.

        Dense features are sliced across ranks (data parallelism) while each
        rank holds only its local embedding tables (model parallelism); an
        all-to-all exchange then redistributes the lookup results so every
        rank ends up with all features for its own slice of the batch.
        """
        batch_size = dense_x.size()[0]
        # WARNING: # of ranks must be <= batch size in distributed_forward call
        if batch_size < ext_dist.my_size:
            sys.exit(
                "ERROR: batch_size (%d) must be larger than number of ranks (%d)"
                % (batch_size, ext_dist.my_size)
            )
        if batch_size % ext_dist.my_size != 0:
            sys.exit(
                "ERROR: batch_size %d can not split across %d ranks evenly"
                % (batch_size, ext_dist.my_size)
            )
        # keep only this rank's slice of the batch and its local tables' inputs
        dense_x = dense_x[ext_dist.get_my_slice(batch_size)]
        lS_o = lS_o[self.local_emb_slice]
        lS_i = lS_i[self.local_emb_slice]
        # sanity check: one offsets/indices group per local embedding table
        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
            sys.exit(
                "ERROR: corrupted model input detected in distributed_forward call"
            )
        # embeddings
        with record_function("DLRM embedding forward"):
            ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
        # WARNING: Note that at this point we have the result of the embedding lookup
        # for the entire batch on each rank. We would like to obtain partial results
        # corresponding to all embedding lookups, but part of the batch on each rank.
        # Therefore, matching the distribution of output of bottom mlp, so that both
        # could be used for subsequent interactions on each device.
        if len(self.emb_l) != len(ly):
            sys.exit("ERROR: corrupted intermediate result in distributed_forward call")
        # kick off the (async) all-to-all and overlap it with the bottom mlp
        a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank)
        with record_function("DLRM bottom nlp forward"):
            x = self.apply_mlp(dense_x, self.bot_l)
        # block until the exchanged embedding outputs are available
        ly = a2a_req.wait()
        ly = list(ly)
        # interactions
        with record_function("DLRM interaction forward"):
            z = self.interact_features(x, ly)
        # top mlp
        with record_function("DLRM top nlp forward"):
            p = self.apply_mlp(z, self.top_l)
        # clamp output if needed
        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
            z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
        else:
            z = p
        return z
def sequential_forward(self, dense_x, lS_o, lS_i):
# process dense features (using bottom mlp), resulting in a row vector
x = self.apply_mlp(dense_x, self.bot_l)
# debug prints
# print("intermediate")
# print(x.detach().cpu().numpy())
# process sparse features(using embeddings), resulting in a list of row vectors
ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
# for y in ly:
# print(y.detach().cpu().numpy())
# interact features (dense and sparse)
z = self.interact_features(x, ly)
# print(z.detach().cpu().numpy())
# obtain probability of a click (using top mlp)
p = self.apply_mlp(z, self.top_l)
# clamp output if needed
if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
else:
z = p
return z
    def parallel_forward(self, dense_x, lS_o, lS_i):
        """Forward pass for a single node with multiple GPUs.

        MLPs are replicated on every device (data parallelism) while the
        embedding tables are assigned to devices round-robin (model
        parallelism); a butterfly shuffle then re-aligns the embedding
        outputs with the scattered mini-batch before the interaction step.
        Mutates self: replaces emb_l/v_W_l with device-resident copies and
        caches the MLP replicas until the batch size changes.
        """
        ### prepare model (overwrite) ###
        # WARNING: # of devices must be >= batch size in parallel_forward call
        batch_size = dense_x.size()[0]
        ndevices = min(self.ndevices, batch_size, len(self.emb_l))
        device_ids = range(ndevices)
        # WARNING: must redistribute the model if mini-batch size changes(this is common
        # for last mini-batch, when # of elements in the dataset/batch size is not even
        if self.parallel_model_batch_size != batch_size:
            self.parallel_model_is_not_prepared = True
        if self.parallel_model_is_not_prepared or self.sync_dense_params:
            # replicate mlp (data parallelism)
            self.bot_l_replicas = replicate(self.bot_l, device_ids)
            self.top_l_replicas = replicate(self.top_l, device_ids)
            self.parallel_model_batch_size = batch_size
        if self.parallel_model_is_not_prepared:
            # distribute embeddings (model parallelism)
            t_list = []
            w_list = []
            for k, emb in enumerate(self.emb_l):
                # round-robin: table k lives on device k % ndevices
                d = torch.device("cuda:" + str(k % ndevices))
                t_list.append(emb.to(d))
                if self.weighted_pooling == "learned":
                    w_list.append(Parameter(self.v_W_l[k].to(d)))
                elif self.weighted_pooling == "fixed":
                    w_list.append(self.v_W_l[k].to(d))
                else:
                    w_list.append(None)
            self.emb_l = nn.ModuleList(t_list)
            if self.weighted_pooling == "learned":
                self.v_W_l = nn.ParameterList(w_list)
            else:
                self.v_W_l = w_list
            self.parallel_model_is_not_prepared = False
        ### prepare input (overwrite) ###
        # scatter dense features (data parallelism)
        # print(dense_x.device)
        dense_x = scatter(dense_x, device_ids, dim=0)
        # distribute sparse features (model parallelism)
        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
            sys.exit("ERROR: corrupted model input detected in parallel_forward call")
        t_list = []
        i_list = []
        for k, _ in enumerate(self.emb_l):
            # move each table's offsets/indices to that table's device
            d = torch.device("cuda:" + str(k % ndevices))
            t_list.append(lS_o[k].to(d))
            i_list.append(lS_i[k].to(d))
        lS_o = t_list
        lS_i = i_list
        ### compute results in parallel ###
        # bottom mlp
        # WARNING: Note that the self.bot_l is a list of bottom mlp modules
        # that have been replicated across devices, while dense_x is a tuple of dense
        # inputs that has been scattered across devices on the first (batch) dimension.
        # The output is a list of tensors scattered across devices according to the
        # distribution of dense_x.
        x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids)
        # debug prints
        # print(x)
        # embeddings
        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
        # debug prints
        # print(ly)
        # butterfly shuffle (implemented inefficiently for now)
        # WARNING: Note that at this point we have the result of the embedding lookup
        # for the entire batch on each device. We would like to obtain partial results
        # corresponding to all embedding lookups, but part of the batch on each device.
        # Therefore, matching the distribution of output of bottom mlp, so that both
        # could be used for subsequent interactions on each device.
        if len(self.emb_l) != len(ly):
            sys.exit("ERROR: corrupted intermediate result in parallel_forward call")
        t_list = []
        for k, _ in enumerate(self.emb_l):
            d = torch.device("cuda:" + str(k % ndevices))
            # split table k's (full-batch) output across devices on the batch dim
            y = scatter(ly[k], device_ids, dim=0)
            t_list.append(y)
        # adjust the list to be ordered per device
        ly = list(map(lambda y: list(y), zip(*t_list)))
        # debug prints
        # print(ly)
        # interactions
        z = []
        for k in range(ndevices):
            zk = self.interact_features(x[k], ly[k])
            z.append(zk)
        # debug prints
        # print(z)
        # top mlp
        # WARNING: Note that the self.top_l is a list of top mlp modules that
        # have been replicated across devices, while z is a list of interaction results
        # that by construction are scattered across devices on the first (batch) dim.
        # The output is a list of tensors scattered across devices according to the
        # distribution of z.
        p = parallel_apply(self.top_l_replicas, z, None, device_ids)
        ### gather the distributed results ###
        p0 = gather(p, self.output_d, dim=0)
        # clamp output if needed
        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
            z0 = torch.clamp(
                p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
            )
        else:
            z0 = p0
        return z0
def dash_separated_ints(value):
    """argparse type checker: every dash-separated piece must parse as an int."""

    def _is_int(token):
        try:
            int(token)
        except ValueError:
            return False
        return True

    if not all(_is_int(token) for token in value.split("-")):
        raise argparse.ArgumentTypeError(
            "%s is not a valid dash separated list of ints" % value
        )
    return value
def dash_separated_floats(value):
    """argparse type checker: every dash-separated piece must parse as a float."""

    def _is_float(token):
        try:
            float(token)
        except ValueError:
            return False
        return True

    if not all(_is_float(token) for token in value.split("-")):
        raise argparse.ArgumentTypeError(
            "%s is not a valid dash separated list of floats" % value
        )
    return value
def inference(
    args,
    dlrm,
    best_acc_test,
    best_auc_test,
    test_ld,
    device,
    use_gpu,
    log_iter=-1,
):
    """Evaluate ``dlrm`` over the test loader and report accuracy metrics.

    Relies on module globals set up in run(): ``nbatches`` / ``nbatches_test``
    (batch caps), ``ndevices`` and ``writer`` (TensorBoard SummaryWriter).
    With --mlperf-logging, scores are accumulated and scored with sklearn
    metrics at the end; otherwise a running round-to-nearest accuracy is kept.

    Returns:
        (model_metrics_dict, is_best): a checkpoint-ready dict (state_dict
        plus metrics) and whether this evaluation beat the previous best
        accuracy (or AUC under mlperf logging). The best_* parameters are
        only rebound locally; callers must track bests via the return value.
    """
    test_accu = 0
    test_samp = 0
    if args.mlperf_logging:
        scores = []
        targets = []
    for i, testBatch in enumerate(test_ld):
        # early exit if nbatches was set by the user and was exceeded
        if nbatches > 0 and i >= nbatches:
            break
        X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch(
            testBatch
        )
        # Skip the batch if batch size not multiple of total ranks
        if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0:
            print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0)))
            continue
        # forward pass
        Z_test = dlrm_wrap(
            X_test,
            lS_o_test,
            lS_i_test,
            use_gpu,
            device,
            ndevices=ndevices,
        )
        ### gather the distributed results on each rank ###
        # For some reason it requires explicit sync before all_gather call if
        # tensor is on GPU memory
        if Z_test.is_cuda:
            torch.cuda.synchronize()
        (_, batch_split_lengths) = ext_dist.get_split_lengths(X_test.size(0))
        if ext_dist.my_size > 1:
            Z_test = ext_dist.all_gather(Z_test, batch_split_lengths)
        if args.mlperf_logging:
            # defer metric computation: just collect scores and targets
            S_test = Z_test.detach().cpu().numpy()  # numpy array
            T_test = T_test.detach().cpu().numpy()  # numpy array
            scores.append(S_test)
            targets.append(T_test)
        else:
            with record_function("DLRM accuracy compute"):
                # compute loss and accuracy
                S_test = Z_test.detach().cpu().numpy()  # numpy array
                T_test = T_test.detach().cpu().numpy()  # numpy array
                mbs_test = T_test.shape[0]  # = mini_batch_size except last
                # count correct predictions (scores rounded to 0/1)
                A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8))
                test_accu += A_test
                test_samp += mbs_test
    if args.mlperf_logging:
        with record_function("DLRM mlperf sklearn metrics compute"):
            scores = np.concatenate(scores, axis=0)
            targets = np.concatenate(targets, axis=0)
            # threshold-based metrics wrap sklearn with rounded predictions;
            # ap and roc_auc consume the raw scores directly
            metrics = {
                "recall": lambda y_true, y_score: sklearn.metrics.recall_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "precision": lambda y_true, y_score: sklearn.metrics.precision_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "f1": lambda y_true, y_score: sklearn.metrics.f1_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
                "ap": sklearn.metrics.average_precision_score,
                "roc_auc": sklearn.metrics.roc_auc_score,
                "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score(
                    y_true=y_true, y_pred=np.round(y_score)
                ),
            }
            validation_results = {}
            for metric_name, metric_function in metrics.items():
                validation_results[metric_name] = metric_function(targets, scores)
                writer.add_scalar(
                    "mlperf-metrics-test/" + metric_name,
                    validation_results[metric_name],
                    log_iter,
                )
            acc_test = validation_results["accuracy"]
    else:
        acc_test = test_accu / test_samp
        writer.add_scalar("Test/Acc", acc_test, log_iter)
    model_metrics_dict = {
        "nepochs": args.nepochs,
        "nbatches": nbatches,
        "nbatches_test": nbatches_test,
        "state_dict": dlrm.state_dict(),
        "test_acc": acc_test,
    }
    if args.mlperf_logging:
        # under mlperf logging, "best" is decided by AUC rather than accuracy
        is_best = validation_results["roc_auc"] > best_auc_test
        if is_best:
            best_auc_test = validation_results["roc_auc"]
            model_metrics_dict["test_auc"] = best_auc_test
        print(
            "recall {:.4f}, precision {:.4f},".format(
                validation_results["recall"],
                validation_results["precision"],
            )
            + " f1 {:.4f}, ap {:.4f},".format(
                validation_results["f1"], validation_results["ap"]
            )
            + " auc {:.4f}, best auc {:.4f},".format(
                validation_results["roc_auc"], best_auc_test
            )
            + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
                validation_results["accuracy"] * 100, best_acc_test * 100
            ),
            flush=True,
        )
    else:
        is_best = acc_test > best_acc_test
        if is_best:
            best_acc_test = acc_test
        print(
            " accuracy {:3.3f} %, best {:3.3f} %".format(
                acc_test * 100, best_acc_test * 100
            ),
            flush=True,
        )
    return model_metrics_dict, is_best
def run():
### parse arguments ###
parser = argparse.ArgumentParser(
description="Train Deep Learning Recommendation Model (DLRM)"
)
# model related parameters
parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
parser.add_argument(
"--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
)
# j will be replaced with the table number
parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2")
parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1")
parser.add_argument(
"--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
)
parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
parser.add_argument("--weighted-pooling", type=str, default=None)
# embedding table options
parser.add_argument("--md-flag", action="store_true", default=False)
parser.add_argument("--md-threshold", type=int, default=200)
parser.add_argument("--md-temperature", type=float, default=0.3)
parser.add_argument("--md-round-dims", action="store_true", default=False)
parser.add_argument("--qr-flag", action="store_true", default=False)
parser.add_argument("--qr-threshold", type=int, default=200)
parser.add_argument("--qr-operation", type=str, default="mult")
parser.add_argument("--qr-collisions", type=int, default=4)
# activations and loss
parser.add_argument("--activation-function", type=str, default="relu")
parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce
parser.add_argument(
"--loss-weights", type=dash_separated_floats, default="1.0-1.0"
) # for wbce
parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7
parser.add_argument("--round-targets", type=bool, default=False)
# data
parser.add_argument("--data-size", type=int, default=1)
parser.add_argument("--num-batches", type=int, default=0)
parser.add_argument(
"--data-generation",
type=str,
choices=["random", "dataset", "internal"],
default="random",
) # synthetic, dataset or internal
parser.add_argument(
"--rand-data-dist", type=str, default="uniform"
) # uniform or gaussian
parser.add_argument("--rand-data-min", type=float, default=0)
parser.add_argument("--rand-data-max", type=float, default=1)
parser.add_argument("--rand-data-mu", type=float, default=-1)
parser.add_argument("--rand-data-sigma", type=float, default=1)
parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
parser.add_argument("--raw-data-file", type=str, default="")
parser.add_argument("--processed-data-file", type=str, default="")
parser.add_argument("--data-randomize", type=str, default="total") # or day or none
parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
parser.add_argument("--max-ind-range", type=int, default=-1)
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
parser.add_argument("--num-indices-per-lookup", type=int, default=10)
parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
parser.add_argument("--num-workers", type=int, default=0)
parser.add_argument("--memory-map", action="store_true", default=False)
# training
parser.add_argument("--mini-batch-size", type=int, default=1)
parser.add_argument("--nepochs", type=int, default=1)
parser.add_argument("--learning-rate", type=float, default=0.01)
parser.add_argument("--print-precision", type=int, default=5)
parser.add_argument("--numpy-rand-seed", type=int, default=123)
parser.add_argument("--sync-dense-params", type=bool, default=True)
parser.add_argument("--optimizer", type=str, default="sgd")
parser.add_argument(
"--dataset-multiprocessing",
action="store_true",
default=False,
help="The Kaggle dataset can be multiprocessed in an environment \
with more than 7 CPU cores and more than 20 GB of memory. \n \
The Terabyte dataset can be multiprocessed in an environment \
with more than 24 CPU cores and at least 1 TB of memory.",
)
# inference
parser.add_argument("--inference-only", action="store_true", default=False)
# quantize
parser.add_argument("--quantize-mlp-with-bit", type=int, default=32)
parser.add_argument("--quantize-emb-with-bit", type=int, default=32)
# onnx
parser.add_argument("--save-onnx", action="store_true", default=False)
# gpu
parser.add_argument("--use-gpu", action="store_true", default=False)
# distributed
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--dist-backend", type=str, default="")
# debugging and profiling
parser.add_argument("--print-freq", type=int, default=1)
parser.add_argument("--test-freq", type=int, default=-1)
parser.add_argument("--test-mini-batch-size", type=int, default=-1)
parser.add_argument("--test-num-workers", type=int, default=-1)
parser.add_argument("--print-time", action="store_true", default=False)
parser.add_argument("--print-wall-time", action="store_true", default=False)
parser.add_argument("--debug-mode", action="store_true", default=False)
parser.add_argument("--enable-profiling", action="store_true", default=False)
parser.add_argument("--plot-compute-graph", action="store_true", default=False)
parser.add_argument("--tensor-board-filename", type=str, default="run_kaggle_pt")
# store/load model
parser.add_argument("--save-model", type=str, default="")
parser.add_argument("--load-model", type=str, default="")
# mlperf logging (disables other output and stops early)
parser.add_argument("--mlperf-logging", action="store_true", default=False)
# stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
# stop at target AUC Terabyte (no subsampling) 0.8025
parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
parser.add_argument("--mlperf-bin-loader", action="store_true", default=False)
parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False)
# mlperf gradient accumulation iterations
parser.add_argument("--mlperf-grad-accum-iter", type=int, default=1)
# LR policy
parser.add_argument("--lr-num-warmup-steps", type=int, default=0)
parser.add_argument("--lr-decay-start-step", type=int, default=0)
parser.add_argument("--lr-num-decay-steps", type=int, default=0)
global args
global nbatches
global nbatches_test
global writer
args = parser.parse_args()
if args.dataset_multiprocessing:
assert sys.version_info[0] >= 3 and sys.version_info[1] > 7, (
"The dataset_multiprocessing "
+ "flag is susceptible to a bug in Python 3.7 and under. "
+ "https://github.com/facebookresearch/dlrm/issues/172"
)
if args.mlperf_logging:
mlperf_logger.log_event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
mlperf_logger.log_start(
key=mlperf_logger.constants.INIT_START, log_all_ranks=True
)
if args.weighted_pooling is not None:
if args.qr_flag:
sys.exit("ERROR: quotient remainder with weighted pooling is not supported")
if args.md_flag:
sys.exit("ERROR: mixed dimensions with weighted pooling is not supported")
if args.quantize_emb_with_bit in [4, 8]:
if args.qr_flag:
sys.exit(
"ERROR: 4 and 8-bit quantization with quotient remainder is not supported"
)
if args.md_flag:
sys.exit(
"ERROR: 4 and 8-bit quantization with mixed dimensions is not supported"
)
if args.use_gpu:
sys.exit("ERROR: 4 and 8-bit quantization on GPU is not supported")
### some basic setup ###
np.random.seed(args.numpy_rand_seed)
np.set_printoptions(precision=args.print_precision)
torch.set_printoptions(precision=args.print_precision)
torch.manual_seed(args.numpy_rand_seed)
if args.test_mini_batch_size < 0:
# if the parameter is not set, use the training batch size
args.test_mini_batch_size = args.mini_batch_size
if args.test_num_workers < 0:
# if the parameter is not set, use the same parameter for training
args.test_num_workers = args.num_workers
use_gpu = args.use_gpu and torch.cuda.is_available()
if not args.debug_mode:
ext_dist.init_distributed(
local_rank=args.local_rank, use_gpu=use_gpu, backend=args.dist_backend
)
if use_gpu:
torch.cuda.manual_seed_all(args.numpy_rand_seed)
torch.backends.cudnn.deterministic = True
if ext_dist.my_size > 1:
ngpus = 1
device = torch.device("cuda", ext_dist.my_local_rank)
else:
ngpus = torch.cuda.device_count()
device = torch.device("cuda", 0)
print("Using {} GPU(s)...".format(ngpus))
else:
device = torch.device("cpu")
print("Using CPU...")
### prepare training data ###
ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
# input data
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP)
mlperf_logger.barrier()
mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START)
mlperf_logger.barrier()
if args.data_generation == "dataset":
train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
table_feature_map = {idx: idx for idx in range(len(train_data.counts))}
nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
nbatches_test = len(test_ld)
ln_emb = train_data.counts
# enforce maximum limit on number of vectors per embedding
if args.max_ind_range > 0:
ln_emb = np.array(
list(
map(
lambda x: x if x < args.max_ind_range else args.max_ind_range,
ln_emb,
)
)
)
else:
ln_emb = np.array(ln_emb)
m_den = train_data.m_den
ln_bot[0] = m_den
elif args.data_generation == "internal":
if not has_internal_libs:
raise Exception("Internal libraries are not available.")
NUM_BATCHES = 5000
nbatches = args.num_batches if args.num_batches > 0 else NUM_BATCHES
train_ld, feature_to_num_embeddings = fbDataLoader(args.data_size, nbatches)
ln_emb = np.array(list(feature_to_num_embeddings.values()))
m_den = ln_bot[0]
else:
# input and target at random
ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
m_den = ln_bot[0]
train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(
args, ln_emb, m_den
)
nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
nbatches_test = len(test_ld)
args.ln_emb = ln_emb.tolist()
if args.mlperf_logging:
print("command line args: ", json.dumps(vars(args)))
### parse command line arguments ###
m_spa = args.arch_sparse_feature_size
ln_emb = np.asarray(ln_emb)
num_fea = ln_emb.size + 1 # num sparse + num dense features
m_den_out = ln_bot[ln_bot.size - 1]
if args.arch_interaction_op == "dot":
# approach 1: all
# num_int = num_fea * num_fea + m_den_out
# approach 2: unique
if args.arch_interaction_itself:
num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
else:
num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
elif args.arch_interaction_op == "cat":
num_int = num_fea * m_den_out
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ args.arch_interaction_op
+ " is not supported"
)
arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
# sanity check: feature sizes and mlp dimensions must match
if m_den != ln_bot[0]:
sys.exit(
"ERROR: arch-dense-feature-size "
+ str(m_den)
+ " does not match first dim of bottom mlp "
+ str(ln_bot[0])
)
if args.qr_flag:
if args.qr_operation == "concat" and 2 * m_spa != m_den_out:
sys.exit(
"ERROR: 2 arch-sparse-feature-size "
+ str(2 * m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
+ " (note that the last dim of bottom mlp must be 2x the embedding dim)"
)
if args.qr_operation != "concat" and m_spa != m_den_out:
sys.exit(
"ERROR: arch-sparse-feature-size "
+ str(m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
)
else:
if m_spa != m_den_out:
sys.exit(
"ERROR: arch-sparse-feature-size "
+ str(m_spa)
+ " does not match last dim of bottom mlp "
+ str(m_den_out)
)
if num_int != ln_top[0]:
sys.exit(
"ERROR: # of feature interactions "
+ str(num_int)
+ " does not match first dimension of top mlp "
+ str(ln_top[0])
)
# assign mixed dimensions if applicable
if args.md_flag:
m_spa = md_solver(
torch.tensor(ln_emb),
args.md_temperature, # alpha
d0=m_spa,
round_dim=args.md_round_dims,
).tolist()
# test prints (model arch)
if args.debug_mode:
print("model arch:")
print(
"mlp top arch "
+ str(ln_top.size - 1)
+ " layers, with input to output dimensions:"
)
print(ln_top)
print("# of interactions")
print(num_int)
print(
"mlp bot arch "
+ str(ln_bot.size - 1)
+ " layers, with input to output dimensions:"
)
print(ln_bot)
print("# of features (sparse and dense)")
print(num_fea)
print("dense feature size")
print(m_den)
print("sparse feature size")
print(m_spa)
print(
"# of embeddings (= # of sparse features) "
+ str(ln_emb.size)
+ ", with dimensions "
+ str(m_spa)
+ "x:"
)
print(ln_emb)
print("data (inputs and targets):")
for j, inputBatch in enumerate(train_ld):
X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch)
torch.set_printoptions(precision=4)
# early exit if nbatches was set by the user and has been exceeded
if nbatches > 0 and j >= nbatches:
break
print("mini-batch: %d" % j)
print(X.detach().cpu())
# transform offsets to lengths when printing
print(
torch.IntTensor(
[
np.diff(
S_o.detach().cpu().tolist() + list(lS_i[i].shape)
).tolist()
for i, S_o in enumerate(lS_o)
]
)
)
print([S_i.detach().cpu() for S_i in lS_i])
print(T.detach().cpu())
global ndevices
ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
### construct the neural network specified above ###
# WARNING: to obtain exactly the same initialization for
# the weights we need to start from the same random seed.
# np.random.seed(args.numpy_rand_seed)
global dlrm
dlrm = DLRM_Net(
m_spa,
ln_emb,
ln_bot,
ln_top,
arch_interaction_op=args.arch_interaction_op,
arch_interaction_itself=args.arch_interaction_itself,
sigmoid_bot=-1,
sigmoid_top=ln_top.size - 2,
sync_dense_params=args.sync_dense_params,
loss_threshold=args.loss_threshold,
ndevices=ndevices,
qr_flag=args.qr_flag,
qr_operation=args.qr_operation,
qr_collisions=args.qr_collisions,
qr_threshold=args.qr_threshold,
md_flag=args.md_flag,
md_threshold=args.md_threshold,
weighted_pooling=args.weighted_pooling,
loss_function=args.loss_function,
)
# test prints
if args.debug_mode:
print("initial parameters (weights and bias):")
for param in dlrm.parameters():
print(param.detach().cpu().numpy())
# print(dlrm)
if use_gpu:
# Custom Model-Data Parallel
# the mlps are replicated and use data parallelism, while
# the embeddings are distributed and use model parallelism
dlrm = dlrm.to(device) # .cuda()
if dlrm.ndevices > 1:
dlrm.emb_l, dlrm.v_W_l = dlrm.create_emb(
m_spa, ln_emb, args.weighted_pooling
)
else:
if dlrm.weighted_pooling == "fixed":
for k, w in enumerate(dlrm.v_W_l):
dlrm.v_W_l[k] = w.cuda()
# distribute data parallel mlps
if ext_dist.my_size > 1:
if use_gpu:
device_ids = [ext_dist.my_local_rank]
dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids)
dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids)
else:
dlrm.bot_l = ext_dist.DDP(dlrm.bot_l)
dlrm.top_l = ext_dist.DDP(dlrm.top_l)
if not args.inference_only:
if use_gpu and args.optimizer in ["rwsadagrad", "adagrad"]:
sys.exit("GPU version of Adagrad is not supported by PyTorch.")
# specify the optimizer algorithm
opts = {
"sgd": torch.optim.SGD,
"rwsadagrad": RowWiseSparseAdagrad.RWSAdagrad,
"adagrad": torch.optim.Adagrad,
}
parameters = (
dlrm.parameters()
if ext_dist.my_size == 1
else [
{
"params": [p for emb in dlrm.emb_l for p in emb.parameters()],
"lr": args.learning_rate,
},
# TODO check this lr setup
# bottom mlp has no data parallelism
# need to check how do we deal with top mlp
{
"params": dlrm.bot_l.parameters(),
"lr": args.learning_rate,
},
{
"params": dlrm.top_l.parameters(),
"lr": args.learning_rate,
},
]
)
optimizer = opts[args.optimizer](parameters, lr=args.learning_rate)
lr_scheduler = LRPolicyScheduler(
optimizer,
args.lr_num_warmup_steps,
args.lr_decay_start_step,
args.lr_num_decay_steps,
)
### main loop ###
# training or inference
best_acc_test = 0
best_auc_test = 0
skip_upto_epoch = 0
skip_upto_batch = 0
total_time = 0
total_loss = 0
total_iter = 0
total_samp = 0
if args.mlperf_logging:
mlperf_logger.mlperf_submission_log("dlrm")
mlperf_logger.log_event(
key=mlperf_logger.constants.SEED, value=args.numpy_rand_seed
)
mlperf_logger.log_event(
key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, value=args.mini_batch_size
)
# Load model is specified
if not (args.load_model == ""):
print("Loading saved model {}".format(args.load_model))
if use_gpu:
if dlrm.ndevices > 1:
# NOTE: when targeting inference on multiple GPUs,
# load the model as is on CPU or GPU, with the move
# to multiple GPUs to be done in parallel_forward
ld_model = torch.load(args.load_model)
else:
# NOTE: when targeting inference on single GPU,
# note that the call to .to(device) has already happened
ld_model = torch.load(
args.load_model,
map_location=torch.device("cuda"),
# map_location=lambda storage, loc: storage.cuda(0)
)
else:
# when targeting inference on CPU
ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
dlrm.load_state_dict(ld_model["state_dict"])
ld_j = ld_model["iter"]
ld_k = ld_model["epoch"]
ld_nepochs = ld_model["nepochs"]
ld_nbatches = ld_model["nbatches"]
ld_nbatches_test = ld_model["nbatches_test"]
ld_train_loss = ld_model["train_loss"]
ld_total_loss = ld_model["total_loss"]
if args.mlperf_logging:
ld_gAUC_test = ld_model["test_auc"]
ld_acc_test = ld_model["test_acc"]
if not args.inference_only:
optimizer.load_state_dict(ld_model["opt_state_dict"])
best_acc_test = ld_acc_test
total_loss = ld_total_loss
skip_upto_epoch = ld_k # epochs
skip_upto_batch = ld_j # batches
else:
args.print_freq = ld_nbatches
args.test_freq = 0
print(
"Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format(
ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test
)
)
print(
"Training state: loss = {:.6f}".format(
ld_train_loss,
)
)
if args.mlperf_logging:
print(
"Testing state: accuracy = {:3.3f} %, auc = {:.3f}".format(
ld_acc_test * 100, ld_gAUC_test
)
)
else:
print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100))
if args.inference_only:
# Currently only dynamic quantization with INT8 and FP16 weights are
# supported for MLPs and INT4 and INT8 weights for EmbeddingBag
# post-training quantization during the inference.
# By default we don't do the quantization: quantize_{mlp,emb}_with_bit == 32 (FP32)
assert args.quantize_mlp_with_bit in [
8,
16,
32,
], "only support 8/16/32-bit but got {}".format(args.quantize_mlp_with_bit)
assert args.quantize_emb_with_bit in [
4,
8,
32,
], "only support 4/8/32-bit but got {}".format(args.quantize_emb_with_bit)
if args.quantize_mlp_with_bit != 32:
if args.quantize_mlp_with_bit in [8]:
quantize_dtype = torch.qint8
else:
quantize_dtype = torch.float16
dlrm = torch.quantization.quantize_dynamic(
dlrm, {torch.nn.Linear}, quantize_dtype
)
if args.quantize_emb_with_bit != 32:
dlrm.quantize_embedding(args.quantize_emb_with_bit)
# print(dlrm)
print("time/loss/accuracy (if enabled):")
if args.mlperf_logging:
# LR is logged twice for now because of a compliance checker bug
mlperf_logger.log_event(
key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate
)
mlperf_logger.log_event(
key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS,
value=args.lr_num_warmup_steps,
)
# use logging keys from the official HP table and not from the logging library
mlperf_logger.log_event(
key="sgd_opt_base_learning_rate", value=args.learning_rate
)
mlperf_logger.log_event(
key="lr_decay_start_steps", value=args.lr_decay_start_step
)
mlperf_logger.log_event(
key="sgd_opt_learning_rate_decay_steps", value=args.lr_num_decay_steps
)
mlperf_logger.log_event(key="sgd_opt_learning_rate_decay_poly_power", value=2)
tb_file = "./" + args.tensor_board_filename
writer = SummaryWriter(tb_file)
ext_dist.barrier()
with torch.autograd.profiler.profile(
args.enable_profiling, use_cuda=use_gpu, record_shapes=True
) as prof:
if not args.inference_only:
k = 0
total_time_begin = 0
while k < args.nepochs:
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.BLOCK_START,
metadata={
mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1),
mlperf_logger.constants.EPOCH_COUNT: 1,
},
)
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.EPOCH_START,
metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)},
)
if k < skip_upto_epoch:
continue
if args.mlperf_logging:
previous_iteration_time = None
for j, inputBatch in enumerate(train_ld):
if j == 0 and args.save_onnx:
X_onnx, lS_o_onnx, lS_i_onnx, _, _, _ = unpack_batch(inputBatch)
if j < skip_upto_batch:
continue
X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch)
if args.mlperf_logging:
current_time = time_wrap(use_gpu)
if previous_iteration_time:
iteration_time = current_time - previous_iteration_time
else:
iteration_time = 0
previous_iteration_time = current_time
else:
t1 = time_wrap(use_gpu)
# early exit if nbatches was set by the user and has been exceeded
if nbatches > 0 and j >= nbatches:
break
# Skip the batch if batch size not multiple of total ranks
if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0:
print(
"Warning: Skiping the batch %d with size %d"
% (j, X.size(0))
)
continue
mbs = T.shape[0] # = args.mini_batch_size except maybe for last
# forward pass
Z = dlrm_wrap(
X,
lS_o,
lS_i,
use_gpu,
device,
ndevices=ndevices,
)
if ext_dist.my_size > 1:
T = T[ext_dist.get_my_slice(mbs)]
W = W[ext_dist.get_my_slice(mbs)]
# loss
E = loss_fn_wrap(Z, T, use_gpu, device)
# compute loss and accuracy
L = E.detach().cpu().numpy() # numpy array
# training accuracy is not disabled
# S = Z.detach().cpu().numpy() # numpy array
# T = T.detach().cpu().numpy() # numpy array
# # print("res: ", S)
# # print("j, train: BCE ", j, L)
# mbs = T.shape[0] # = args.mini_batch_size except maybe for last
# A = np.sum((np.round(S, 0) == T).astype(np.uint8))
with record_function("DLRM backward"):
# scaled error gradient propagation
# (where we do not accumulate gradients across mini-batches)
if (
args.mlperf_logging
and (j + 1) % args.mlperf_grad_accum_iter == 0
) or not args.mlperf_logging:
optimizer.zero_grad()
# backward pass
E.backward()
# optimizer
if (
args.mlperf_logging
and (j + 1) % args.mlperf_grad_accum_iter == 0
) or not args.mlperf_logging:
optimizer.step()
lr_scheduler.step()
if args.mlperf_logging:
total_time += iteration_time
else:
t2 = time_wrap(use_gpu)
total_time += t2 - t1
total_loss += L * mbs
total_iter += 1
total_samp += mbs
should_print = ((j + 1) % args.print_freq == 0) or (
j + 1 == nbatches
)
should_test = (
(args.test_freq > 0)
and (args.data_generation in ["dataset", "random"])
and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
)
# print time, loss and accuracy
if should_print or should_test:
gT = 1000.0 * total_time / total_iter if args.print_time else -1
total_time = 0
train_loss = total_loss / total_samp
total_loss = 0
str_run_type = (
"inference" if args.inference_only else "training"
)
wall_time = ""
if args.print_wall_time:
wall_time = " ({})".format(time.strftime("%H:%M"))
print(
"Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format(
str_run_type, j + 1, nbatches, k, gT
)
+ " loss {:.6f}".format(train_loss)
+ wall_time,
flush=True,
)
log_iter = nbatches * k + j + 1
writer.add_scalar("Train/Loss", train_loss, log_iter)
total_iter = 0
total_samp = 0
# testing
if should_test:
epoch_num_float = (j + 1) / len(train_ld) + k + 1
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_start(
key=mlperf_logger.constants.EVAL_START,
metadata={
mlperf_logger.constants.EPOCH_NUM: epoch_num_float
},
)
# don't measure training iter time in a test iteration
if args.mlperf_logging:
previous_iteration_time = None
print(
"Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
)
model_metrics_dict, is_best = inference(
args,
dlrm,
best_acc_test,
best_auc_test,
test_ld,
device,
use_gpu,
log_iter,
)
if (
is_best
and not (args.save_model == "")
and not args.inference_only
):
model_metrics_dict["epoch"] = k
model_metrics_dict["iter"] = j + 1
model_metrics_dict["train_loss"] = train_loss
model_metrics_dict["total_loss"] = total_loss
model_metrics_dict["opt_state_dict"] = (
optimizer.state_dict()
)
print("Saving model to {}".format(args.save_model))
torch.save(model_metrics_dict, args.save_model)
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.EVAL_STOP,
metadata={
mlperf_logger.constants.EPOCH_NUM: epoch_num_float
},
)
# Uncomment the line below to print out the total time with overhead
# print("Total test time for this group: {}" \
# .format(time_wrap(use_gpu) - accum_test_time_begin))
if (
args.mlperf_logging
and (args.mlperf_acc_threshold > 0)
and (best_acc_test > args.mlperf_acc_threshold)
):
print(
"MLPerf testing accuracy threshold "
+ str(args.mlperf_acc_threshold)
+ " reached, stop training"
)
break
if (
args.mlperf_logging
and (args.mlperf_auc_threshold > 0)
and (best_auc_test > args.mlperf_auc_threshold)
):
print(
"MLPerf testing auc threshold "
+ str(args.mlperf_auc_threshold)
+ " reached, stop training"
)
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.RUN_STOP,
metadata={
mlperf_logger.constants.STATUS: mlperf_logger.constants.SUCCESS
},
)
break
if args.mlperf_logging:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.EPOCH_STOP,
metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)},
)
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.BLOCK_STOP,
metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1)},
)
k += 1 # nepochs
if args.mlperf_logging and best_auc_test <= args.mlperf_auc_threshold:
mlperf_logger.barrier()
mlperf_logger.log_end(
key=mlperf_logger.constants.RUN_STOP,
metadata={
mlperf_logger.constants.STATUS: mlperf_logger.constants.ABORTED
},
)
else:
print("Testing for inference only")
inference(
args,
dlrm,
best_acc_test,
best_auc_test,
test_ld,
device,
use_gpu,
)
# profiling
if args.enable_profiling:
time_stamp = str(datetime.datetime.now()).replace(" ", "_")
with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f:
prof_f.write(
prof.key_averages(group_by_input_shape=True).table(
sort_by="self_cpu_time_total"
)
)
with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f:
prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total"))
prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json")
# print(prof.key_averages().table(sort_by="cpu_time_total"))
# plot compute graph
if args.plot_compute_graph:
sys.exit(
"ERROR: Please install pytorchviz package in order to use the"
+ " visualization. Then, uncomment its import above as well as"
+ " three lines below and run the code again."
)
# V = Z.mean() if args.inference_only else E
# dot = make_dot(V, params=dict(dlrm.named_parameters()))
# dot.render('dlrm_s_pytorch_graph') # write .pdf file
# test prints
if not args.inference_only and args.debug_mode:
print("updated parameters (weights and bias):")
for param in dlrm.parameters():
print(param.detach().cpu().numpy())
# export the model in onnx
if args.save_onnx:
"""
# workaround 1: tensor -> list
if torch.is_tensor(lS_i_onnx):
lS_i_onnx = [lS_i_onnx[j] for j in range(len(lS_i_onnx))]
# workaound 2: list -> tensor
lS_i_onnx = torch.stack(lS_i_onnx)
"""
# debug prints
# print("inputs", X_onnx, lS_o_onnx, lS_i_onnx)
# print("output", dlrm_wrap(X_onnx, lS_o_onnx, lS_i_onnx, use_gpu, device))
dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx"
batch_size = X_onnx.shape[0]
print("X_onnx.shape", X_onnx.shape)
if torch.is_tensor(lS_o_onnx):
print("lS_o_onnx.shape", lS_o_onnx.shape)
else:
for oo in lS_o_onnx:
print("oo.shape", oo.shape)
if torch.is_tensor(lS_i_onnx):
print("lS_i_onnx.shape", lS_i_onnx.shape)
else:
for ii in lS_i_onnx:
print("ii.shape", ii.shape)
# name inputs and outputs
o_inputs = (
["offsets"]
if torch.is_tensor(lS_o_onnx)
else ["offsets_" + str(i) for i in range(len(lS_o_onnx))]
)
i_inputs = (
["indices"]
if torch.is_tensor(lS_i_onnx)
else ["indices_" + str(i) for i in range(len(lS_i_onnx))]
)
all_inputs = ["dense_x"] + o_inputs + i_inputs
# debug prints
print("inputs", all_inputs)
# create dynamic_axis dictionaries
do_inputs = (
[{"offsets": {1: "batch_size"}}]
if torch.is_tensor(lS_o_onnx)
else [
{"offsets_" + str(i): {0: "batch_size"}} for i in range(len(lS_o_onnx))
]
)
di_inputs = (
[{"indices": {1: "batch_size"}}]
if torch.is_tensor(lS_i_onnx)
else [
{"indices_" + str(i): {0: "batch_size"}} for i in range(len(lS_i_onnx))
]
)
dynamic_axes = {"dense_x": {0: "batch_size"}, "pred": {0: "batch_size"}}
for do in do_inputs:
dynamic_axes.update(do)
for di in di_inputs:
dynamic_axes.update(di)
# debug prints
print(dynamic_axes)
# export model
torch.onnx.export(
dlrm,
(X_onnx, lS_o_onnx, lS_i_onnx),
dlrm_pytorch_onnx_file,
verbose=True,
opset_version=11,
input_names=all_inputs,
output_names=["pred"],
dynamic_axes=dynamic_axes,
dynamo=False,
)
# recover the model back
dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx")
# check the onnx model
onnx.checker.check_model(dlrm_pytorch_onnx)
total_time_end = time_wrap(use_gpu)
# Standard script entry guard: invoke run() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import builtins
import os
import sys
import torch
import torch.distributed as dist
from torch.autograd import Function
from torch.autograd.profiler import record_function
from torch.nn.parallel import DistributedDataParallel as DDP
# Optional collective-communication backends: probe for the Intel oneCCL and
# UCC PyTorch bindings; when unavailable the module-level name is set to
# False so later code can test it truthily.
try:
    import torch_ccl
except ImportError as e:
    # print(e)
    torch_ccl = False
try:
    import torch_ucc
except ImportError as e:
    torch_ucc = False
# Distributed topology state, populated by init_distributed(); -1 means
# "not initialized yet".
my_rank = -1
my_size = -1
my_local_rank = -1
my_local_size = -1
# Set True once an all_to_all_single probe succeeds on the chosen backend.
alltoall_supported = False
# User-requested alltoall implementation: "", "alltoall", "scatter",
# or "scatter_list".
a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "")
# Module-level Request object shared between the *_Req and *_Wait
# autograd functions; created by init_distributed().
myreq = None
def env2int(env_list, default=-1):
    """Return the first non-negative integer found among the named
    environment variables, or *default* if none of them is set."""
    for var_name in env_list:
        value = int(os.environ.get(var_name, -1))
        if value >= 0:
            return value
    return default
def get_my_slice(n):
    """Return the slice of range(n) owned by this rank under a balanced
    partition (the first n % my_size ranks get one extra element)."""
    base, extra = divmod(n, my_size)
    start = my_rank * base + min(my_rank, extra)
    stop = (my_rank + 1) * base + min(my_rank + 1, extra)
    return slice(start, stop, 1)
def get_split_lengths(n):
    """Partition n items across my_size ranks as evenly as possible.

    Returns (my_len, splits): this rank's share, and the per-rank list of
    shares — or None for splits when n divides evenly (uniform case).
    """
    base, extra = divmod(n, my_size)
    if extra == 0:
        return (base, None)
    splits = [base + 1 if r < extra else base for r in range(my_size)]
    return (splits[my_rank], splits)
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
    """Initialize torch.distributed and populate the module topology globals.

    Auto-detects the process-group backend and rank/size from common MPI and
    launcher environment variables when not given explicitly, probes
    all_to_all_single support, and finally creates the shared Request object.

    Args:
        rank: global rank; -1 means guess from env vars.
        local_rank: rank within the node; -1 means guess from env vars.
        size: world size; -1 means guess from env vars.
        use_gpu: if True, pin this process to CUDA device my_local_rank.
        backend: explicit backend name ("ccl"/"nccl"/"mpi"/"gloo"); "" picks
            one automatically when a multi-rank MPI-style launch is detected.
    """
    global myreq
    global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported
    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(
        ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
    )
    # Auto-select a backend only for multi-rank launches.
    if backend == "" and num_mpi_ranks > 1:
        if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
            backend = "ccl"
        elif use_gpu and dist.is_nccl_available():
            backend = "nccl"
        elif dist.is_mpi_available():
            backend = "mpi"
        else:
            print(
                "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
            )
            backend = "gloo"
    if backend != "":
        # guess Rank and size
        if rank == -1:
            rank = env2int(
                ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
            )
        if size == -1:
            size = env2int(
                [
                    "PMI_SIZE",
                    "OMPI_COMM_WORLD_SIZE",
                    "MV2_COMM_WORLD_SIZE",
                    "WORLD_SIZE",
                ],
                1,
            )
        # Export the settings that init_process_group's env:// init reads,
        # without clobbering anything the launcher already set.
        if not os.environ.get("RANK", None) and rank != -1:
            os.environ["RANK"] = str(rank)
        if not os.environ.get("WORLD_SIZE", None) and size != -1:
            os.environ["WORLD_SIZE"] = str(size)
        if not os.environ.get("MASTER_PORT", None):
            os.environ["MASTER_PORT"] = "29500"
        if not os.environ.get("MASTER_ADDR", None):
            local_size = env2int(
                [
                    "MPI_LOCALNRANKS",
                    "OMPI_COMM_WORLD_LOCAL_SIZE",
                    "MV2_COMM_WORLD_LOCAL_SIZE",
                ],
                1,
            )
            # local_size != size implies ranks span multiple nodes, where a
            # loopback MASTER_ADDR will not work (except for the mpi backend).
            if local_size != size and backend != "mpi":
                print(
                    "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
                )
                print(
                    "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR"
                )
            os.environ["MASTER_ADDR"] = "127.0.0.1"
    if size > 1:
        if local_rank == -1:
            # guess the local (per-node) rank from launcher-specific env vars
            my_local_rank = env2int(
                [
                    "MPI_LOCALRANKID",
                    "OMPI_COMM_WORLD_LOCAL_RANK",
                    "MV2_COMM_WORLD_LOCAL_RANK",
                    "LOCAL_RANK",
                ],
                0,
            )
        else:
            my_local_rank = local_rank
        my_local_size = env2int(
            [
                "MPI_LOCALNRANKS",
                "OMPI_COMM_WORLD_LOCAL_SIZE",
                "MV2_COMM_WORLD_LOCAL_SIZE",
            ],
            1,
        )
        if use_gpu:
            if my_local_size > torch.cuda.device_count():
                print(
                    "Not sufficient GPUs available... local_size = %d, ngpus = %d"
                    % (my_local_size, torch.cuda.device_count())
                )
                sys.exit(1)
            # one GPU per local rank
            torch.cuda.set_device(my_local_rank)
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
        # Probe whether the chosen backend actually supports the fused
        # all_to_all_single primitive by running a tiny exchange.
        if hasattr(dist, "all_to_all_single"):
            try:
                t = torch.zeros([4])
                if use_gpu:
                    t = t.cuda()
                dist.all_to_all_single(t, t)
                alltoall_supported = True
            except RuntimeError as err:
                print("fail to enable all_to_all_single primitive: %s" % err)
        # Fall back to the scatter/gather implementation if the user asked
        # for the fused alltoall but the backend cannot provide it.
        if a2a_impl == "alltoall" and alltoall_supported == False:
            print(
                "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
                % (a2a_impl, backend)
            )
            a2a_impl = "scatter"
        if a2a_impl != "":
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
    else:
        # Single-process run: trivial topology, no process group created.
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    print_all(
        "world size: %d, current rank: %d, local rank: %d"
        % (my_size, my_rank, my_local_rank)
    )
    myreq = Request()
class Request(object):
    """Handle for an in-flight collective operation.

    The *_Req autograd functions stash their async handles and output
    tensors on the module-level instance; wait() runs the matching
    WaitFunction to complete the exchange and returns its result.
    """

    def __init__(self):
        self.req = None
        self.tensor = None
        self.WaitFunction = All2All_Scatter_Wait

    def wait(self):
        """Complete the pending exchange and return its outputs."""
        result = self.WaitFunction.apply(*self.tensor)
        self.tensor = None
        self.req = None
        return result
class All2All_ScatterList_Req(Function):
    """Launch phase of the scatter-list alltoall: one async dist.scatter per
    (rank, table) pair, so each rank receives every table's embeddings for
    its own batch shard.  Completed by All2All_ScatterList_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        # inputs: one (local batch, emb_dim) tensor per local embedding table.
        global myreq
        # Per-rank batch shard sizes; a scalar fallback means uniform shards.
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # Number of embedding tables owned by each rank.
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        gather_list = []
        req_list = []
        for i in range(my_size):
            for j in range(table_split_lengths[i]):
                out_tensor = inputs[0].new_empty(
                    [a2a_info.local_batch_num, a2a_info.emb_dim]
                )
                # Only the source rank supplies the scatter payload; other
                # ranks pass an empty list and just receive.
                scatter_list = (
                    list(inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else []
                )
                req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True)
                gather_list.append(out_tensor)
                req_list.append(req)
        # Hand the async handles and buffers to the matching Wait function
        # via the module-level request object.
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # Complete the async gathers issued by All2All_ScatterList_Wait.backward.
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_inputs = myreq.tensor
        myreq.tensor = None
        # Leading None is the gradient slot for the non-tensor a2a_info arg.
        return (None, *grad_inputs)
class All2All_ScatterList_Wait(Function):
    """Completion phase of the scatter-list alltoall: waits on the scatters
    issued by All2All_ScatterList_Req; its backward launches the mirror
    per-(rank, table) async gathers for the gradients."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        # Stash exchange metadata for backward before clearing the request.
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        a2a_info = ctx.a2a_info
        # dist.gather requires contiguous tensors.
        grad_output = [t.contiguous() for t in grad_output]
        # Per-rank batch shard sizes (uniform fallback when not provided).
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else [a2a_info.local_batch_num] * my_size
        )
        # Number of embedding tables owned by each rank.
        per_rank_table_splits = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        # One full-batch gradient buffer per local table.
        grad_inputs = [
            grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim])
            for _ in range(a2a_info.local_table_num)
        ]
        req_list = []
        ind = 0
        for i in range(my_size):
            for j in range(per_rank_table_splits[i]):
                # Destination rank i reassembles its table-j gradient from
                # every rank's batch shard; non-destination ranks pass None.
                gather_list = (
                    list(grad_inputs[j].split(batch_split_lengths, dim=0))
                    if i == my_rank
                    else None
                )
                req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True)
                req_list.append(req)
                ind += 1
        # All2All_ScatterList_Req.backward waits on these and returns the
        # gathered gradients.
        myreq.req = req_list
        myreq.tensor = grad_inputs
        return tuple(grad_output)
class All2All_Scatter_Req(Function):
    """Launch phase of the scatter-based alltoall: concatenates the local
    tables and issues one async dist.scatter per rank.  Completed by
    All2All_Scatter_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        # Per-rank batch shard sizes; a scalar fallback means uniform shards.
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # Number of embedding tables owned by each rank.
        table_split_lengths = (
            a2a_info.global_table_wise_parition_slices
            if a2a_info.global_table_wise_parition_slices
            else [a2a_info.local_table_num] * my_size
        )
        # All local tables side by side: (batch, local_table_num * emb_dim).
        input = torch.cat(inputs, dim=1)
        scatter_list = list(input.split(batch_split_lengths, dim=0))
        gather_list = []
        req_list = []
        for i in range(my_size):
            # Buffer for rank i's tables restricted to our batch shard.
            out_tensor = input.new_empty(
                [a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim]
            )
            # Only the source rank supplies the payload.
            req = dist.scatter(
                out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True
            )
            gather_list.append(out_tensor)
            req_list.append(req)
        myreq.req = req_list
        myreq.tensor = tuple(gather_list)
        myreq.a2a_info = a2a_info
        ctx.a2a_info = a2a_info
        return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # Complete the async gathers issued by All2All_Scatter_Wait.backward.
        for r in myreq.req:
            r.wait()
        myreq.req = None
        grad_input = myreq.tensor
        # Split the concatenated gradient back into per-table chunks.
        grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1)
        myreq.tensor = None
        # Leading None is the gradient slot for the non-tensor a2a_info arg.
        return (None, *grad_inputs)
class All2All_Scatter_Wait(Function):
    """Completion phase of the scatter-based alltoall (pairs with
    All2All_Scatter_Req); backward issues one async dist.gather per rank to
    route the gradients back to their source ranks."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        # Stash exchange metadata for backward, then complete all scatters.
        ctx.a2a_info = myreq.a2a_info
        for r in myreq.req:
            r.wait()
        myreq.req = None
        myreq.tensor = None
        return output

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        # One gradient tensor per rank is expected.
        assert len(grad_output) == my_size
        # dist.gather requires contiguous tensors.
        scatter_list = [t.contiguous() for t in grad_output]
        a2a_info = ctx.a2a_info
        # Per-rank batch shard sizes (scalar fallback = uniform local batch).
        batch_split_lengths = (
            a2a_info.global_batch_partition_slices
            if a2a_info.global_batch_partition_slices
            else a2a_info.local_batch_num
        )
        # (A table_split_lengths computation previously here was never used
        # and has been removed; the buffer shape depends only on the local
        # table count.)
        grad_input = grad_output[0].new_empty(
            [a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num]
        )
        gather_list = list(grad_input.split(batch_split_lengths, dim=0))
        req_list = []
        for i in range(my_size):
            # Rank i collects its batch shard's gradient from every rank.
            req = dist.gather(
                scatter_list[i],
                gather_list if i == my_rank else [],
                dst=i,
                async_op=True,
            )
            req_list.append(req)
        # All2All_Scatter_Req.backward waits on these and reads grad_input.
        myreq.req = req_list
        myreq.tensor = grad_input
        return grad_output
class All2All_Req(Function):
    """Launch phase of the fused alltoall: a single async
    dist.all_to_all_single over one flattened buffer.  Completed by
    All2All_Wait."""

    @staticmethod
    def forward(ctx, a2a_info, *inputs):
        global myreq
        with record_function("DLRM alltoall_req_fwd_single"):
            # Scale per-rank batch shard sizes to flat element counts
            # (input/send splits for all_to_all_single).
            batch_split_lengths = a2a_info.global_batch_partition_slices
            if batch_split_lengths:
                batch_split_lengths = [
                    m * a2a_info.emb_dim * a2a_info.local_table_num
                    for m in batch_split_lengths
                ]
            # Scale per-rank table counts to flat element counts
            # (output/recv splits).
            table_split_lengths = a2a_info.global_table_wise_parition_slices
            if table_split_lengths:
                table_split_lengths = [
                    a2a_info.local_batch_num * e * a2a_info.emb_dim
                    for e in table_split_lengths
                ]
            # Flatten all local tables into one 1-D send buffer.
            input = torch.cat(inputs, dim=1).view([-1])
            output = input.new_empty(
                [
                    a2a_info.global_table_num
                    * a2a_info.local_batch_num
                    * a2a_info.emb_dim
                ]
            )
            req = dist.all_to_all_single(
                output, input, table_split_lengths, batch_split_lengths, async_op=True
            )
            myreq.req = req
            myreq.tensor = []
            myreq.tensor.append(output)
            myreq.tensor = tuple(myreq.tensor)
            # Cache the element-count splits for the Wait/backward phases.
            a2a_info.batch_split_lengths = batch_split_lengths
            a2a_info.table_split_lengths = table_split_lengths
            myreq.a2a_info = a2a_info
            ctx.a2a_info = a2a_info
            return myreq.tensor

    @staticmethod
    def backward(ctx, *grad_output):
        global myreq
        with record_function("DLRM alltoall_req_bwd_single"):
            a2a_info = ctx.a2a_info
            # Complete the async alltoall issued by All2All_Wait.backward.
            myreq.req.wait()
            myreq.req = None
            grad_input = myreq.tensor
            # Reshape the flat gradient to (batch, tables * emb_dim) and
            # split into per-table chunks.
            grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split(
                a2a_info.emb_dim, dim=1
            )
            grad_inputs = [gin.contiguous() for gin in grad_inputs]
            myreq.tensor = None
            # Leading None is the gradient slot for the non-tensor a2a_info.
            return (None, *grad_inputs)
class All2All_Wait(Function):
    """Completion phase of the fused alltoall: waits on the single
    all_to_all_single and reshapes the flat buffer into per-table outputs."""

    @staticmethod
    def forward(ctx, *output):
        global myreq
        with record_function("DLRM alltoall_wait_fwd_single"):
            a2a_info = myreq.a2a_info
            ctx.a2a_info = a2a_info
            myreq.req.wait()
            myreq.req = None
            myreq.tensor = None
            # Element counts received from each rank; the scalar fallback
            # assumes a uniform split across ranks.
            table_split_lengths = (
                a2a_info.table_split_lengths
                if a2a_info.table_split_lengths
                else a2a_info.local_table_num
                * a2a_info.local_batch_num
                * a2a_info.emb_dim
            )
            # Split into per-source chunks, then restore 2-D (batch, feat).
            outputs = output[0].split(table_split_lengths)
            outputs = tuple(
                [out.view([a2a_info.local_batch_num, -1]) for out in outputs]
            )
            return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        global myreq
        with record_function("DLRM alltoall_wait_bwd_single"):
            a2a_info = ctx.a2a_info
            # Flatten per-table gradients into one 1-D send buffer.
            grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
            grad_output = torch.cat(grad_outputs)
            grad_input = grad_output.new_empty(
                [a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim]
            )
            # Reverse exchange: send/recv splits swapped vs. forward.
            req = dist.all_to_all_single(
                grad_input,
                grad_output,
                a2a_info.batch_split_lengths,
                a2a_info.table_split_lengths,
                async_op=True,
            )
            myreq.req = req
            myreq.tensor = grad_input
            return (grad_output,)
class AllGather(Function):
    """Autograd-aware all_gather along an arbitrary dimension with
    potentially unequal per-rank lengths."""

    @staticmethod
    def forward(ctx, input, global_lengths, dim=0):
        """Gather `input` from all ranks and concatenate along `dim`.

        Args:
            input: this rank's shard; its size along `dim` must match this
                rank's entry in `global_lengths`.
            global_lengths: per-rank lengths along `dim`; a scalar means the
                same length on every rank.
            dim: concatenation dimension.
        """
        if not isinstance(global_lengths, (list, tuple)):
            global_lengths = [global_lengths] * my_size
        assert len(global_lengths) == my_size
        assert global_lengths[my_rank] == input.size(dim)
        local_start = sum(global_lengths[:my_rank])
        output_size = list(input.size())
        # Remember where this rank's shard lives so backward can slice it out.
        ctx.dim = dim
        ctx.local_start = local_start
        ctx.local_length = global_lengths[my_rank]
        input = input.contiguous()
        if dim == 0:
            # Gather directly into views of one preallocated output tensor.
            out_len = sum(global_lengths)
            output_size[dim] = out_len
            output = input.new_empty(output_size)
            gather_list = list(output.split(global_lengths, dim=0))
        else:
            # Gather into per-rank buffers and concatenate afterwards.
            # (A list of empty_like buffers was previously allocated here and
            # immediately discarded; that dead allocation has been removed.)
            gather_list = []
            for length in global_lengths:
                output_size[dim] = length
                gather_list.append(input.new_empty(output_size))
        dist.all_gather(gather_list, input)
        if dim != 0:
            output = torch.cat(gather_list, dim=dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # print("Inside All2AllBackward")
        # Gradient of all_gather: each rank keeps only its own slice.
        dim = ctx.dim
        start = ctx.local_start
        length = ctx.local_length
        grad_input = grad_output.narrow(dim, start, length)
        # None gradients for the non-tensor global_lengths and dim arguments.
        return (grad_input, None, None)
class All2AllInfo(object):
    """Plain attribute bag describing one alltoall exchange; fields
    (table/batch partitioning, emb_dim, batch_size, ...) are assigned
    ad hoc by alltoall()."""
def alltoall(inputs, per_rank_table_splits):
    """Start an asynchronous alltoall exchange of embedding outputs.

    Args:
        inputs: list of (batch_size, emb_dim) tensors, one per local table.
        per_rank_table_splits: per-rank table counts, or falsy for a uniform
            distribution of local_table_num tables per rank.

    Returns:
        The module-level Request; call its wait() to obtain the outputs.
    """
    global myreq
    batch_size, emb_dim = inputs[0].size()
    a2a_info = All2AllInfo()
    a2a_info.local_table_num = len(inputs)
    a2a_info.global_table_wise_parition_slices = per_rank_table_splits
    (
        a2a_info.local_batch_num,
        a2a_info.global_batch_partition_slices,
    ) = get_split_lengths(batch_size)
    a2a_info.emb_dim = emb_dim
    a2a_info.batch_size = batch_size
    a2a_info.global_table_num = (
        sum(per_rank_table_splits)
        if per_rank_table_splits
        else a2a_info.local_table_num * my_size
    )
    # Dispatch to the fused alltoall when supported/requested, otherwise to
    # one of the scatter-based fallbacks.
    if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall":
        # print("Using All2All_Req")
        output = All2All_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Wait
    elif a2a_impl == "" or a2a_impl == "scatter":
        # print("Using All2All_Scatter_Req")
        output = All2All_Scatter_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_Scatter_Wait
    elif a2a_impl == "scatter_list":
        # print("Using All2All_ScatterList_Req")
        output = All2All_ScatterList_Req.apply(a2a_info, *inputs)
        myreq.WaitFunction = All2All_ScatterList_Wait
    else:
        # NOTE(review): on an unknown impl we only warn and still return
        # myreq with a stale WaitFunction — presumably unreachable since
        # init_distributed() normalizes a2a_impl; confirm.
        print(
            "Unknown value set for DLRM_ALLTOALL_IMPL (%s), "
            "please use one of [alltoall, scatter, scatter_list]" % a2a_impl
        )
    return myreq
def all_gather(input, lengths, dim=0):
    """All-gather `input` across ranks along `dim`; a falsy `lengths` means
    every rank contributes input.size(0) rows."""
    per_rank_lengths = lengths if lengths else [input.size(0)] * my_size
    return AllGather.apply(input, per_rank_lengths, dim)
def barrier():
    """Synchronize all ranks; a no-op when running single-process."""
    if my_size <= 1:
        return
    dist.barrier()
# Override the builtin print so that, by default, only rank 0 emits output.
orig_print = builtins.print


def rank0_print(*args, **kwargs):
    """Rank-aware replacement for print().

    Prints when this process is rank 0 (or rank is uninitialized, my_rank
    <= 0), or when the caller passes print_all=True.  The print_all flag is
    consumed here (popped) rather than merely read: previously it was left
    in kwargs and forwarded to the real print(), which would raise
    TypeError on the unexpected keyword argument.
    """
    print_everywhere = kwargs.pop("print_all", False)
    if print_everywhere or my_rank <= 0:
        orig_print(*args, **kwargs)


builtins.print = rank0_print
# Allow printing from all ranks by calling print_all() explicitly; unlike
# the rank-0-only builtin override, this never filters by rank.
def print_all(*args, **kwargs):
    """Print from every rank using the original builtin print."""
    orig_print(*args, **kwargs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment