Commit c43a53e4 authored by sunxx1's avatar sunxx1

Merge branch 'add_Recommendation' into 'main'

Add VAE-CF and dlrm

See merge request dcutoolkit/deeplearing/dlexamples_new!24
parents 5394b117 56225fdf
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Utilities for MLPerf logging
"""
import os
import torch
try:
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants
    _MLLOGGER = mllog.get_mllogger()
except ImportError as error:
    print("Unable to import mlperf_logging, ", error)

def log_start(*args, **kwargs):
    "log with start tag"
    _log_print(_MLLOGGER.start, *args, **kwargs)


def log_end(*args, **kwargs):
    "log with end tag"
    _log_print(_MLLOGGER.end, *args, **kwargs)


def log_event(*args, **kwargs):
    "log with event tag"
    _log_print(_MLLOGGER.event, *args, **kwargs)

def _log_print(logger, *args, **kwargs):
    "makes the mlperf logger aware of distributed execution"
    if 'stack_offset' not in kwargs:
        kwargs['stack_offset'] = 3
    if 'value' not in kwargs:
        kwargs['value'] = None

    if kwargs.pop('log_all_ranks', False):
        log = True
    else:
        log = (get_rank() == 0)

    if log:
        logger(*args, **kwargs)

def config_logger(benchmark):
    "initializes the mlperf logger"
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    _MLLOGGER.logger.propagate = False

def barrier():
    """
    Works as a temporary distributed barrier; currently PyTorch
    doesn't implement barrier for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

def get_rank():
    """
    Gets the distributed rank, or returns zero if distributed is not initialized.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    return rank

def mlperf_submission_log(benchmark):
    """
    Logs information needed for an MLPerf submission.
    """
    config_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark)
    log_event(
        key=constants.SUBMISSION_ORG,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')
    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')
    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_ENTRY,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_POC_NAME,
        value='reference_implementation')
    log_event(
        key=constants.SUBMISSION_POC_EMAIL,
        value='reference_implementation')
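
For context, a minimal sketch of how these helpers might be wired into a training script. This is illustrative only: the module name mlperf_utils, the benchmark name 'dlrm', and the epoch loop are assumptions and are not part of this commit; the keys used (RUN_START, EPOCH_START, ...) are standard constants from mlperf_logging.mllog.constants.

# Illustrative usage sketch, not part of the commit.
# Assumes the file above is saved as mlperf_utils.py and mlperf_logging is installed.
from mlperf_logging.mllog import constants

import mlperf_utils

mlperf_utils.mlperf_submission_log('dlrm')        # emits the SUBMISSION_* events
mlperf_utils.log_start(key=constants.RUN_START)   # only rank 0 logs by default
for epoch in range(3):
    mlperf_utils.log_start(key=constants.EPOCH_START,
                           metadata={'epoch_num': epoch})
    # ... train one epoch ...
    mlperf_utils.log_end(key=constants.EPOCH_STOP,
                         metadata={'epoch_num': epoch})
mlperf_utils.log_end(key=constants.RUN_STOP,
                     metadata={'status': 'success'})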
future
numpy
onnx
pydot
torch
torchviz
scikit-learn
tqdm
torchrec-nightly
torchx-nightly
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2
#check if extra argument is passed to the test
if [[ $# == 1 ]]; then
    dlrm_extra_option=$1
else
    dlrm_extra_option=""
fi
#echo $dlrm_extra_option
dlrm_py="python dlrm_s_pytorch.py"
dlrm_c2="python dlrm_s_caffe2.py"
echo "Running commands ..."
#run pytorch
echo $dlrm_py
$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1
$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3
$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4
#run caffe2
echo $dlrm_c2
$dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1
$dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2
$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3
$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4
echo "Checking results ..."
#check results
#WARNING: a correct test will show no difference in numeric values
#(but might show some verbal differences, e.g. due to warnings)
#in the output files
echo "diff test1 (no numeric values in the output = SUCCESS)"
diff ccc1 ppp1
echo "diff test2 (no numeric values in the output = SUCCESS)"
diff ccc2 ppp2
echo "diff test3 (no numeric values in the output = SUCCESS)"
diff ccc3 ppp3
echo "diff test4 (no numeric values in the output = SUCCESS)"
diff ccc4 ppp4
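
As the comments above state, a run passes when the PyTorch and Caffe2 outputs differ in no numeric values (at most in wording, such as warnings). The snippet below is a hypothetical, more selective variant of that check and is not part of the repository: it diffs each ppp*/ccc* pair produced by the script and reports only differing lines that contain digits.

# Hypothetical helper, not part of the repository: compare a Caffe2 output file
# with its PyTorch counterpart and report only differing lines that contain digits,
# since purely verbal differences (e.g. warnings) are expected and harmless.
import difflib
import re


def numeric_diff(caffe2_file, pytorch_file):
    with open(caffe2_file) as f:
        caffe2_lines = f.readlines()
    with open(pytorch_file) as f:
        pytorch_lines = f.readlines()
    diff = difflib.unified_diff(caffe2_lines, pytorch_lines,
                                fromfile=caffe2_file, tofile=pytorch_file)
    # Keep only added/removed lines that actually contain a number.
    return [line for line in diff
            if line[:1] in ('+', '-')
            and not line.startswith(('+++', '---'))
            and re.search(r'\d', line)]


for i in range(1, 5):
    mismatches = numeric_diff(f'ccc{i}', f'ppp{i}')
    print(f'test{i}:', 'SUCCESS' if not mismatches else 'FAILURE')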