Merge pull request #13 from Ribin-Baby/dgx/bugfix

minor bug fixes

Merge pull request #13 from Ribin-Baby/dgx/bugfix
minor bug fixes
d06288b2 · Shriya Rishab · GitHub · 5b62935b · b93fbddc · d06288b2
Unverified commit d06288b2, authored Jan 14, 2025 by Shriya Rishab; committed by GitHub on Jan 14, 2025.
8 changed files
--- a/NVIDIA/benchmarks/bert/implementations/pytorch/run_with_docker.sh
+++ b/NVIDIA/benchmarks/bert/implementations/pytorch/run_with_docker.sh
@@ -92,7 +92,13 @@ cleanup_docker
 trap 'set -eux; cleanup_docker' EXIT
 # Setup container
-nvidia-docker run --rm --init --detach --gpus='"'device=${NV_GPU}'"' \
+if [ -z "${NV_GPU-}" ]; then
+  readonly _docker_gpu_args="--gpus all"
+else
+  readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
+fi
+docker run ${_docker_gpu_args} --rm --init --detach \
    --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
    --ulimit=stack=67108864 --ulimit=memlock=-1 \
    --name="${_cont_name}" ${_cont_mounts[@]} \

--- a/NVIDIA/benchmarks/dlrm_dcnv2/implementations/hugectr/run_with_docker.sh
+++ b/NVIDIA/benchmarks/dlrm_dcnv2/implementations/hugectr/run_with_docker.sh
@@ -60,7 +60,13 @@ cleanup_docker
 trap 'set -eux; cleanup_docker' EXIT
 # Setup container
-nvidia-docker run --rm --init --detach \
+if [ -z "${NV_GPU-}" ]; then
+  readonly _docker_gpu_args="--gpus all"
+else
+  readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
+fi
+docker run ${_docker_gpu_args} --rm --init --detach \
    --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
    --name="${_cont_name}" "${_cont_mounts[@]}" \
    "${CONT}" sleep infinity

--- a/NVIDIA/benchmarks/maskrcnn/implementations/pytorch/run_with_docker.sh
+++ b/NVIDIA/benchmarks/maskrcnn/implementations/pytorch/run_with_docker.sh
@@ -60,7 +60,13 @@ cleanup_docker
 trap 'set -eux; cleanup_docker' EXIT
 # Setup container
-nvidia-docker run --rm --init --detach \
+if [ -z "${NV_GPU-}" ]; then
+  readonly _docker_gpu_args="--gpus all"
+else
+  readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
+fi
+docker run ${_docker_gpu_args} --rm --init --detach \
    --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
    --ulimit=stack=67108864 --ulimit=memlock=-1 \
    --name="${_cont_name}" "${_cont_mounts[@]}" \

--- a/NVIDIA/benchmarks/resnet/implementations/mxnet/common/fit.py
+++ b/NVIDIA/benchmarks/resnet/implementations/mxnet/common/fit.py
@@ -53,7 +53,9 @@ import cuda_graphs.graph_wrapper as graph_wrapper
 from common.data import SyntheticDataIter
-from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge
+# from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge
+from mlperf_common.scaleoutbridge import init_bridge, ScaleoutBridgeBase as SBridge
+from mlperf_common.frameworks.mxnet import MXNetProfilerHandler, MPICommunicationHandler
 TRAIN_CUDA_GRAPH_ID = 0
@@ -907,7 +909,8 @@ def mlperf_fit(self, args, train_data,
        key=mllogger.constants.BLOCK_START,
        metadata={'first_epoch_num': block_epoch_start + 1, 'epoch_count': block_epoch_count})
-    sbridge = init_bridge(hvd.rank())
+    #sbridge = init_bridge(hvd.rank())
+    sbridge = init_bridge(MXNetProfilerHandler(), MPICommunicationHandler(), mllogger)
    ################################################################################
    # training loop with dali overlap with fwd

--- a/NVIDIA/benchmarks/resnet/implementations/mxnet/common/optimizer.py
+++ b/NVIDIA/benchmarks/resnet/implementations/mxnet/common/optimizer.py
@@ -48,7 +48,8 @@ from mxnet.ndarray import sparse
 #####
 from mxnet import cuda_utils as cu
-from scaleoutbridge import ScaleoutBridge as SBridge
+# from scaleoutbridge import ScaleoutBridge as SBridge
+from mlperf_common.scaleoutbridge import ScaleoutBridgeBase as SBridge
 from common.data import SyntheticDataIter

--- a/NVIDIA/benchmarks/resnet/implementations/mxnet/run_with_docker.sh
+++ b/NVIDIA/benchmarks/resnet/implementations/mxnet/run_with_docker.sh
@@ -70,7 +70,13 @@ cleanup_docker
 trap 'set -eux; cleanup_docker' EXIT
 # Setup container
-nvidia-docker run --rm --init --detach \
+if [ -z "${NV_GPU-}" ]; then
+  readonly _docker_gpu_args="--gpus all"
+else
+  readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
+fi
+docker run ${_docker_gpu_args} --rm --init --detach \
    --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
    --ulimit=stack=67108864 --ulimit=memlock=-1 \
    --name="${_cont_name}" "${_cont_mounts[@]}" \

--- a/NVIDIA/benchmarks/unet3d/implementations/mxnet/run_with_docker.sh
+++ b/NVIDIA/benchmarks/unet3d/implementations/mxnet/run_with_docker.sh
@@ -51,7 +51,13 @@ cleanup_docker
 trap 'set -eux; cleanup_docker' EXIT
 # Setup container
-nvidia-docker run --rm --init --detach \
+if [ -z "${NV_GPU-}" ]; then
+  readonly _docker_gpu_args="--gpus all"
+else
+  readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
+fi
+docker run ${_docker_gpu_args} --rm --init --detach \
    --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
    --ulimit=stack=67108864 --ulimit=memlock=-1 \
    --name="${_cont_name}" "${_cont_mounts[@]}" \

--- a/NVIDIA/benchmarks/unet3d/implementations/mxnet/runtime/arguments.py
+++ b/NVIDIA/benchmarks/unet3d/implementations/mxnet/runtime/arguments.py
@@ -90,4 +90,16 @@ PARSER.add_argument('--input_batch_multiplier', '-ibm', dest='input_batch_multip
 PARSER.add_argument('--use_cached_loader', '-ucl', dest='use_cached_loader', action='store_true', default=False)
 PARSER.add_argument('--stick_to_shard', '-sts', dest='stick_to_shard', action='store_true', default=False)
 PARSER.add_argument('--use_nvshmem', dest='use_nvshmem', action='store_true', default=False)
\ No newline at end of file
+## Additional arguments passed
+PARSER.add_argument('--dense_seq_output', action='store_true', help='Enable dense sequential output')
+PARSER.add_argument('--pad_fmha', action='store_true', help='Enable padding for FMHA')
+PARSER.add_argument('--fused_bias_fc', action='store_true', help='Enable fused bias for FC')
+PARSER.add_argument('--fused_bias_mha', action='store_true', help='Enable fused bias for MHA')
+PARSER.add_argument('--fused_dropout_add', action='store_true', help='Enable fused dropout and add')
+PARSER.add_argument('--fused_gemm_gelu', action='store_true', help='Enable fused GEMM and GELU')
+PARSER.add_argument('--packed_samples', action='store_true', help='Enable packed samples')
+PARSER.add_argument('--use_transformer_engine2', action='store_true', help='Enable transformer engine v2')
+PARSER.add_argument('--cuda_graph_mode', type=str, help='CUDA graph mode')
+PARSER.add_argument('--use_cuda_graph', action='store_true', help='Use CUDA graph')
\ No newline at end of file