Commit 9c8a2a14 authored by xinghao

Initial commit
# @lint-ignore-every LICENSELINT
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Utilities for MLPerf logging
"""
import os
import torch
try:
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants

    _MLLOGGER = mllog.get_mllogger()
except ImportError as error:
    print("Unable to import mlperf_logging, ", error)


def log_start(*args, **kwargs):
    "log with start tag"
    _log_print(_MLLOGGER.start, *args, **kwargs)


def log_end(*args, **kwargs):
    "log with end tag"
    _log_print(_MLLOGGER.end, *args, **kwargs)


def log_event(*args, **kwargs):
    "log with event tag"
    _log_print(_MLLOGGER.event, *args, **kwargs)


def _log_print(logger, *args, **kwargs):
    "makes mlperf logger aware of distributed execution"
    if "stack_offset" not in kwargs:
        kwargs["stack_offset"] = 3
    if "value" not in kwargs:
        kwargs["value"] = None

    if kwargs.pop("log_all_ranks", False):
        log = True
    else:
        log = get_rank() == 0

    if log:
        logger(*args, **kwargs)
def config_logger(benchmark):
    "initiates mlperf logger"
    mllog.config(
        filename=os.path.join(
            os.path.dirname(os.path.abspath(__file__)), f"{benchmark}.log"
        )
    )
    _MLLOGGER.logger.propagate = False
def barrier():
    """
    Works as a temporary distributed barrier; at the time this was written,
    PyTorch did not implement a barrier for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    return rank
def mlperf_submission_log(benchmark):
    """
    Logs information needed for MLPerf submission
    """
    config_logger(benchmark)
    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )
    log_event(key=constants.SUBMISSION_ORG, value="reference_implementation")
    log_event(key=constants.SUBMISSION_DIVISION, value="closed")
    log_event(key=constants.SUBMISSION_STATUS, value="onprem")
    log_event(key=constants.SUBMISSION_PLATFORM, value="reference_implementation")
    log_event(key=constants.SUBMISSION_ENTRY, value="reference_implementation")
    log_event(key=constants.SUBMISSION_POC_NAME, value="reference_implementation")
    log_event(key=constants.SUBMISSION_POC_EMAIL, value="reference_implementation")
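A brief usage sketch of these helpers (illustrative only; it assumes the file above is importable as mlperf_logger and that the mlperf_logging package is installed -- neither name is confirmed by this commit):

# Emit submission metadata once, then mark the run boundaries.
from mlperf_logging.mllog import constants

from mlperf_logger import log_end, log_start, mlperf_submission_log  # hypothetical module name

mlperf_submission_log("dlrm")            # configures <benchmark>.log next to the module
log_start(key=constants.RUN_START)       # only rank 0 logs unless log_all_ranks=True
# ... training loop ...
log_end(key=constants.RUN_STOP, metadata={"status": "success"})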
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from torch.optim import Optimizer
class RWSAdagrad(Optimizer):
    """Implements Row Wise Sparse Adagrad algorithm.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            gradient-square accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
    """
    def __init__(
        self,
        params,
        lr=1e-2,
        lr_decay=0.0,
        weight_decay=0.0,
        initial_accumulator_value=0.0,
        eps=1e-10,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                "Invalid initial_accumulator_value value: {}".format(
                    initial_accumulator_value
                )
            )
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        self.defaults = dict(
            lr=lr,
            lr_decay=lr_decay,
            eps=eps,
            weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value,
        )
        super(RWSAdagrad, self).__init__(params, self.defaults)

        self.momentum_initialized = False

        for group in self.param_groups:
            for p in group["params"]:
                self.state[p]["step"] = 0
    def share_memory(self):
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                if p.grad.data.is_sparse:
                    state["momentum"].share_memory_()
                else:
                    state["sum"].share_memory_()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                if not self.momentum_initialized:
                    if p.grad.data.is_sparse:
                        self.state[p]["momentum"] = torch.full(
                            [p.data.shape[0]],
                            self.defaults["initial_accumulator_value"],
                            dtype=torch.float32,
                        )
                    else:
                        self.state[p]["sum"] = torch.full_like(
                            p.data,
                            self.defaults["initial_accumulator_value"],
                            dtype=torch.float32,
                        )

                grad = p.grad
                state = self.state[p]
                state["step"] += 1

                if group["weight_decay"] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError(
                            "weight_decay option is not compatible with sparse gradients"
                        )
                    grad = grad.add(p.data, alpha=group["weight_decay"])

                clr = group["lr"] / (1.0 + (state["step"] - 1.0) * group["lr_decay"])

                if grad.is_sparse:
                    # The update is non-linear, so indices must be unique.
                    grad = grad.coalesce()
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values, row_wise):
                        constructor = grad.new
                        matrix_size = [size[0]] if row_wise else size
                        return constructor(grad_indices, values, matrix_size)

                    if grad_values.numel() > 0:
                        momentum_update = make_sparse(
                            grad_values.pow(2).mean(dim=1), True
                        )
                        state["momentum"].add_(momentum_update)  # update momentum
                        std = state["momentum"].sparse_mask(momentum_update.coalesce())
                        std_values = std._values().sqrt_().add_(group["eps"])
                        p.data.add_(
                            make_sparse(
                                grad_values / std_values.view(std_values.size()[0], 1),
                                False,
                            ),
                            alpha=-clr,
                        )
                else:
                    state["sum"].addcmul_(grad, grad, value=1.0)
                    std = state["sum"].sqrt().add_(group["eps"])
                    p.data.addcdiv_(grad, std, value=-clr)

        self.momentum_initialized = True

        return loss
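A brief usage sketch for the optimizer above (illustrative; the EmbeddingBag model and its sizes are placeholders, not part of this commit):

# Row-wise sparse Adagrad keeps one accumulator value per embedding row, so a
# sparse gradient from EmbeddingBag(..., sparse=True) exercises the `momentum`
# branch of step(); dense parameters fall back to element-wise Adagrad.
import torch

emb = torch.nn.EmbeddingBag(100, 16, mode="sum", sparse=True)
optimizer = RWSAdagrad(emb.parameters(), lr=0.01)

ids = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4])
loss = emb(ids, offsets).sum()
loss.backward()      # emb.weight.grad is a sparse tensor
optimizer.step()     # updates only the rows touched by `ids`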
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# WARNING: must have compiled PyTorch

# Check if an extra argument was passed to the test.
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_py="python dlrm_s_pytorch.py"
echo "Running commands ..."
#run pytorch
echo $dlrm_py
$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1
$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4
echo "All PyTorch tests completed."
echo "Output files: ppp1, ppp2, ppp3, ppp4"
ARG FROM_IMAGE_NAME=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime
FROM ${FROM_IMAGE_NAME}
WORKDIR /workspace/torchrec_dlrm
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import torchx.specs as specs
from torchx.components.dist import ddp
def run_dlrm_main(num_trainers: int = 8, *script_args: str) -> specs.AppDef:
    """
    Args:
        num_trainers: The number of trainers to use.
        script_args: A variable number of parameters to provide dlrm_main.py.
    """
    cwd = os.getcwd()
    entrypoint = os.path.join(cwd, "dlrm_main.py")

    user = os.environ.get("USER")
    image = f"/data/home/{user}"

    if num_trainers > 8 and num_trainers % 8 != 0:
        raise ValueError(
            "Trainer jobs spanning multiple hosts must be in multiples of 8."
        )
    nproc_per_node = 8 if num_trainers >= 8 else num_trainers
    num_replicas = max(num_trainers // 8, 1)

    return ddp(
        *script_args,
        name="train_dlrm",
        image=image,
        # AWS p4d instance (https://aws.amazon.com/ec2/instance-types/p4/).
        cpu=96,
        gpu=8,
        memMB=-1,
        script=entrypoint,
        j=f"{num_replicas}x{nproc_per_node}",
    )
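A quick way to sanity-check this component is to build the AppDef locally without submitting it (illustrative; the module name torchx_run is an assumption, since the file name is not visible in this view):

# Build the AppDef and inspect the resulting topology.
from torchx_run import run_dlrm_main  # hypothetical module name

app = run_dlrm_main(16)               # 16 trainers -> j="2x8" (2 hosts, 8 procs each)
print(app.roles[0].num_replicas)      # expected to be 2 with the parameters above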
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Copied from https://github.com/facebookresearch/dlrm/blob/mlperf/dlrm_s_pytorch.py
import sys
from torch.optim.lr_scheduler import _LRScheduler
class LRPolicyScheduler(_LRScheduler):
    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps
        if self.decay_start_step < self.num_warmup_steps:
            sys.exit("Learning rate warmup must finish before the decay starts")

        super(LRPolicyScheduler, self).__init__(optimizer)
    def get_lr(self):
        step_count = self._step_count
        if step_count < self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step_count and step_count < self.decay_end_step:
            # decay
            decayed_steps = step_count - self.decay_start_step
            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            if self.num_decay_steps > 0:
                # freeze at the last value, either because we're after the decay
                # window or because we're between warmup and decay
                lr = self.last_lr
            else:
                # do not adjust
                lr = self.base_lrs
        return lr
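A small worked example of the schedule above (illustrative; the dummy parameter and SGD optimizer are placeholders, not part of this commit):

# Linear warmup for 4 steps, a hold phase, then quadratic decay over 4 steps
# toward the 1e-7 floor; printing the lr makes the three phases visible.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=1.0)
scheduler = LRPolicyScheduler(
    optimizer, num_warmup_steps=4, decay_start_step=8, num_decay_steps=4
)

for step in range(12):
    optimizer.step()
    scheduler.step()
    print(step, [round(g["lr"], 6) for g in optimizer.param_groups])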
9287c283b01087427df915257300bf36 day_0_sparse_multi_hot.npz
f4fedd921a4b214b03d0dfcd31cc30e6 day_1_sparse_multi_hot.npz
2a15bdd8d25781c4cdcf5d791dfd19f9 day_2_sparse_multi_hot.npz
0341aaee2f661f9e939a39d7a6be0aea day_3_sparse_multi_hot.npz
e8db54dbf5fe438ecb76fcc2d520f31a day_4_sparse_multi_hot.npz
fd35a7a2bc0ba63935b4b1c742eca018 day_5_sparse_multi_hot.npz
7d5c72b6bbe8be1dfa1f69db5c7e64fd day_6_sparse_multi_hot.npz
59bcb9855243d3a5c0ae56daa18e1033 day_7_sparse_multi_hot.npz
b9f7fbccae6bb9fdabf30259b5e305f0 day_8_sparse_multi_hot.npz
03da5bf484870a3c77f66befc680ad04 day_9_sparse_multi_hot.npz
eb048fc4fbd7ffa7932b81a523fe5a39 day_10_sparse_multi_hot.npz
a2ebee45c9836c8c8598610a6a3e9d60 day_11_sparse_multi_hot.npz
0dd59855e1a7b65c42f7e7af303c610e day_12_sparse_multi_hot.npz
7510698f3fcff9f3d7ef9dd478e637aa day_13_sparse_multi_hot.npz
562978b7b93e179f1adb9b9f5e1dc338 day_14_sparse_multi_hot.npz
042967232f016fbccf0d40d72d0b48bb day_15_sparse_multi_hot.npz
7b59170fb0e2d1e78f15cb60cea22723 day_16_sparse_multi_hot.npz
5054c54515d2574cda0f11646787df44 day_17_sparse_multi_hot.npz
28d3dbf6c70e68f01df12a4c3298f754 day_18_sparse_multi_hot.npz
db7554263a1754d3e29341d0f03bc2f0 day_19_sparse_multi_hot.npz
91ee92ffb4810c26e157c1335ef4de06 day_20_sparse_multi_hot.npz
2c99fad7b146b0ba581dce34f640f44e day_21_sparse_multi_hot.npz
c7ba52c5aaf24d76acca22a0cb13b737 day_22_sparse_multi_hot.npz
c46b7e31ec6f2f8768fa60bdfc0f6e40 day_23_sparse_multi_hot.npz
427113b0c4d85a8fceaf793457302067 day_0_dense.npy
4db255ce4388893e7aa1dcf157077975 day_0_labels.npy
8b444e74159dbede896e2f3b5ed31ac0 day_0_sparse.npy
3afc11c56062d8bbea4df300b5a42966 day_1_dense.npy
fb40746738a7c6f4ee021033bdd518c5 day_1_labels.npy
61e95a487c955b515155b31611444f32 day_1_sparse.npy
4e73d5bb330c43826665bec142c6b407 day_2_dense.npy
f0adfec8191781e3f201d45f923e6ea1 day_2_labels.npy
0473d30872cd6e582c5da0272a0569f8 day_2_sparse.npy
df1f3395e0da4a06aa23b2e069ff3ad9 day_3_dense.npy
69caadf4d219f18b83f3591fe76f17c7 day_3_labels.npy
d6b0d02ff18da470b7ee17f97d5380e0 day_3_sparse.npy
27868a93adc66c47d4246acbad8bb689 day_4_dense.npy
c4a6a16342f0770d67d689c6c173c681 day_4_labels.npy
ca54008489cb84becc3f37e7b29035c7 day_4_sparse.npy
e9bc6de06d09b1feebf857d9786ee15c day_5_dense.npy
9e3e17f345474cfbde5d62b543e07d6b day_5_labels.npy
d1374ee84f80ea147957f8af0e12ebe4 day_5_sparse.npy
09c8bf0fd4798172e0369134ddc7204a day_6_dense.npy
945cef1132ceab8b23f4d0e269522be2 day_6_labels.npy
e4df1c271e1edd72ee4658a39cca2888 day_6_sparse.npy
ae718f0d6d29a8b605ae5d12fad3ffcc day_7_dense.npy
5ff5e7eef5b88b80ef03d06fc7e81bcf day_7_labels.npy
cbcb7501a6b74a45dd5c028c13a4afbc day_7_sparse.npy
5a589746fd15819afbc70e2503f94b35 day_8_dense.npy
43871397750dfdc69cadcbee7e95f2bd day_8_labels.npy
c1fb4369c7da27d23f4c7f97c8893250 day_8_sparse.npy
4bb86eecb92eb4e3368085c2b1bab131 day_9_dense.npy
f851934555147d436131230ebbdd5609 day_9_labels.npy
e4ac0fb8a030f0769541f88142c9f931 day_9_sparse.npy
7fc29f50da6c60185381ca4ad1cb2059 day_10_dense.npy
e3b3f6f974c4820064db0046bbf954c8 day_10_labels.npy
1018a9ab88c4a7369325c9d6df73b411 day_10_sparse.npy
df822ae73cbaa016bf7d371d87313b56 day_11_dense.npy
26219e9c89c6ce831e7da273da666df1 day_11_labels.npy
f1596fc0337443a6672a864cd541fb05 day_11_sparse.npy
015968b4d9940ec9e28cc34788013d6e day_12_dense.npy
f0ca7ce0ab6033cdd355df94d11c7ed7 day_12_labels.npy
03a2ebd22b01cc18b6e338de77b4103f day_12_sparse.npy
9d79239a9e976e4dd9b8839c7cbe1eba day_13_dense.npy
4b099b9200bbb490afc08b5cd63daa0e day_13_labels.npy
2b507e0f97d972ea6ada9b3af64de151 day_13_sparse.npy
9242e6c974603ec235f163f72fdbc766 day_14_dense.npy
80cae15e032ffb9eff292738ba4d0dce day_14_labels.npy
3dccc979f7c71fae45a10c98ba6c9cb7 day_14_sparse.npy
64c6c0fcd0940f7e0d7001aa945ec8f8 day_15_dense.npy
a6a730d1ef55368f3f0b21d32b039662 day_15_labels.npy
c852516852cc404cb40d4de8626d2ca1 day_15_sparse.npy
5c75b60e63e9cf98dec13fbb64839c10 day_16_dense.npy
5a71a29d8df1e8baf6bf28353f1588d4 day_16_labels.npy
6c838050751697a91bbf3e68ffd4a696 day_16_sparse.npy
9798bccb5a67c5eac834153ea8bbe110 day_17_dense.npy
0a814b7eb83f375dd5a555ade6908356 day_17_labels.npy
40d2bc23fbcccb3ddb1390cc5e694cf0 day_17_sparse.npy
cda094dfe7f5711877a6486f9863cd4b day_18_dense.npy
a4fa26ada0d4c312b7e3354de0f5ee30 day_18_labels.npy
51711de9194737813a74bfb25c0f5d30 day_18_sparse.npy
0f0b2c0ed279462cdcc6f79252fd3395 day_19_dense.npy
b21ad457474b01bd3f95fc46b6b9f04b day_19_labels.npy
dd4b72cd704981441d17687f526e42ae day_19_sparse.npy
95ffc084f6cafe382afe72cbcae186bc day_20_dense.npy
9555e572e8bee22d71db8c2ac121ea8a day_20_labels.npy
bc9a8c79c93ea39f32230459b4c4572a day_20_sparse.npy
4680683973be5b1a890c9314cfb2e93b day_21_dense.npy
672edc866e7ff1928d15338a99e5f336 day_21_labels.npy
e4a8ae42a6d46893da6edb73e7d8a3f7 day_21_sparse.npy
3d56f190639398da2bfdc33f87cd34f0 day_22_dense.npy
733da710c5981cb67d041aa1039e4e6b day_22_labels.npy
42ef88d6bb2550a88711fed6fc144846 day_22_sparse.npy
cdf7af87cbc7e9b468c0be46b1767601 day_23_dense.npy
dd68f93301812026ed6f58dfb0757fa7 day_23_labels.npy
0c33f1562529cc3bca7f3708e2be63c9 day_23_sparse.npy