Commit c43a53e4 authored by sunxx1's avatar sunxx1

Merge branch 'add_Recommendation' into 'main'

Add VAE-CF and dlrm

See merge request dcutoolkit/deeplearing/dlexamples_new!24
parents 5394b117 56225fdf
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Utilities for MLPerf logging
"""
import os
import torch
try:
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants
    _MLLOGGER = mllog.get_mllogger()
except ImportError as error:
    print("Unable to import mlperf_logging, ", error)

def log_start(*args, **kwargs):
    "log with start tag"
    _log_print(_MLLOGGER.start, *args, **kwargs)


def log_end(*args, **kwargs):
    "log with end tag"
    _log_print(_MLLOGGER.end, *args, **kwargs)


def log_event(*args, **kwargs):
    "log with event tag"
    _log_print(_MLLOGGER.event, *args, **kwargs)

def _log_print(logger, *args, **kwargs):
    "makes the mlperf logger aware of distributed execution"
    if 'stack_offset' not in kwargs:
        kwargs['stack_offset'] = 3
    if 'value' not in kwargs:
        kwargs['value'] = None

    if kwargs.pop('log_all_ranks', False):
        log = True
    else:
        log = (get_rank() == 0)

    if log:
        logger(*args, **kwargs)

def config_logger(benchmark):
    "initializes the mlperf logger"
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    _MLLOGGER.logger.propagate = False

def barrier():
    """
    Works as a temporary distributed barrier; currently PyTorch
    doesn't implement barrier for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

def get_rank():
    """
    Gets the distributed rank, or returns zero if distributed is not initialized.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    return rank

def mlperf_submission_log(benchmark):
    """
    Logs information needed for an MLPerf submission.
    """
    config_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark)
    log_event(
        key=constants.SUBMISSION_ORG,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')
    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')
    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_ENTRY,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_POC_NAME,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_POC_EMAIL,
        value='reference_implementation')
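
For context, a minimal sketch of how these helpers might be wired into a training script. This is illustrative only: the module name mlperf_utils, the benchmark name 'dlrm', and the epoch loop are assumptions and are not part of this commit; the keys used (RUN_START, EPOCH_START, ...) are standard constants from mlperf_logging.mllog.constants.

# Illustrative usage sketch, not part of the commit.
# Assumes the file above is saved as mlperf_utils.py and mlperf_logging is installed.
from mlperf_logging.mllog import constants

import mlperf_utils

mlperf_utils.mlperf_submission_log('dlrm')        # emits the SUBMISSION_* events
mlperf_utils.log_start(key=constants.RUN_START)   # only rank 0 logs by default
for epoch in range(3):
    mlperf_utils.log_start(key=constants.EPOCH_START,
                           metadata={'epoch_num': epoch})
    # ... train one epoch ...
    mlperf_utils.log_end(key=constants.EPOCH_STOP,
                         metadata={'epoch_num': epoch})
mlperf_utils.log_end(key=constants.RUN_STOP,
                     metadata={'status': 'success'})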
future
numpy
onnx
pydot
torch
torchviz
scikit-learn
tqdm
torchrec-nightly
torchx-nightly
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_py="python dlrm_s_pytorch.py"
dlrm_c2="python dlrm_s_caffe2.py"
echo "Running commands ..."
#run pytorch
echo $dlrm_py
$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1
$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4
#run caffe2
echo $dlrm_c2
$dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1
$dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2
$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3
$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4
echo "Checking results ..."
#check results
#WARNING: a correct test will show no difference in numeric values
#(but might show some verbal differences, e.g. due to warnings)
#in the output files
echo "diff test1 (no numeric values in the output = SUCCESS)"
diff ccc1 ppp1
echo "diff test2 (no numeric values in the output = SUCCESS)"
diff ccc2 ppp2
echo "diff test3 (no numeric values in the output = SUCCESS)"
diff ccc3 ppp3
echo "diff test4 (no numeric values in the output = SUCCESS)"
diff ccc4 ppp4
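
As the comments above state, a run passes when the PyTorch and Caffe2 outputs differ in no numeric values (at most in wording, such as warnings). The snippet below is a hypothetical, more selective variant of that check and is not part of the repository: it diffs each ppp*/ccc* pair produced by the script and reports only differing lines that contain digits.

# Hypothetical helper, not part of the repository: compare a Caffe2 output file
# with its PyTorch counterpart and report only differing lines that contain digits,
# since purely verbal differences (e.g. warnings) are expected and harmless.
import difflib
import re


def numeric_diff(caffe2_file, pytorch_file):
    with open(caffe2_file) as f:
        caffe2_lines = f.readlines()
    with open(pytorch_file) as f:
        pytorch_lines = f.readlines()
    diff = difflib.unified_diff(caffe2_lines, pytorch_lines,
                                fromfile=caffe2_file, tofile=pytorch_file)
    # Keep only added/removed lines that actually contain a number.
    return [line for line in diff
            if line[:1] in ('+', '-')
            and not line.startswith(('+++', '---'))
            and re.search(r'\d', line)]


for i in range(1, 5):
    mismatches = numeric_diff(f'ccc{i}', f'ppp{i}')
    print(f'test{i}:', 'SUCCESS' if not mismatches else 'FAILURE')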