Commit b93fbddc authored by Ribin-Baby's avatar Ribin-Baby
Browse files

bug fix update

parent 5b62935b
......@@ -92,7 +92,13 @@ cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
nvidia-docker run --rm --init --detach --gpus='"'device=${NV_GPU}'"' \
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${_cont_name}" ${_cont_mounts[@]} \
......
......@@ -60,7 +60,13 @@ cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
nvidia-docker run --rm --init --detach \
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--name="${_cont_name}" "${_cont_mounts[@]}" \
"${CONT}" sleep infinity
......
......@@ -60,7 +60,13 @@ cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
nvidia-docker run --rm --init --detach \
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${_cont_name}" "${_cont_mounts[@]}" \
......
......@@ -53,7 +53,9 @@ import cuda_graphs.graph_wrapper as graph_wrapper
from common.data import SyntheticDataIter
from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge
# from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge
from mlperf_common.scaleoutbridge import init_bridge, ScaleoutBridgeBase as SBridge
from mlperf_common.frameworks.mxnet import MXNetProfilerHandler, MPICommunicationHandler
TRAIN_CUDA_GRAPH_ID = 0
......@@ -907,7 +909,8 @@ def mlperf_fit(self, args, train_data,
key=mllogger.constants.BLOCK_START,
metadata={'first_epoch_num': block_epoch_start + 1, 'epoch_count': block_epoch_count})
sbridge = init_bridge(hvd.rank())
#sbridge = init_bridge(hvd.rank())
sbridge = init_bridge(MXNetProfilerHandler(), MPICommunicationHandler(), mllogger)
################################################################################
# training loop with dali overlap with fwd
......
......@@ -48,7 +48,8 @@ from mxnet.ndarray import sparse
#####
from mxnet import cuda_utils as cu
from scaleoutbridge import ScaleoutBridge as SBridge
# from scaleoutbridge import ScaleoutBridge as SBridge
from mlperf_common.scaleoutbridge import ScaleoutBridgeBase as SBridge
from common.data import SyntheticDataIter
......
......@@ -70,7 +70,13 @@ cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
nvidia-docker run --rm --init --detach \
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${_cont_name}" "${_cont_mounts[@]}" \
......
......@@ -51,7 +51,13 @@ cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
nvidia-docker run --rm --init --detach \
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${_cont_name}" "${_cont_mounts[@]}" \
......
......@@ -90,4 +90,16 @@ PARSER.add_argument('--input_batch_multiplier', '-ibm', dest='input_batch_multip
PARSER.add_argument('--use_cached_loader', '-ucl', dest='use_cached_loader', action='store_true', default=False)
PARSER.add_argument('--stick_to_shard', '-sts', dest='stick_to_shard', action='store_true', default=False)
PARSER.add_argument('--use_nvshmem', dest='use_nvshmem', action='store_true', default=False)
\ No newline at end of file
PARSER.add_argument('--use_nvshmem', dest='use_nvshmem', action='store_true', default=False)
## Additional arguments passed
PARSER.add_argument('--dense_seq_output', action='store_true', help='Enable dense sequential output')
PARSER.add_argument('--pad_fmha', action='store_true', help='Enable padding for FMHA')
PARSER.add_argument('--fused_bias_fc', action='store_true', help='Enable fused bias for FC')
PARSER.add_argument('--fused_bias_mha', action='store_true', help='Enable fused bias for MHA')
PARSER.add_argument('--fused_dropout_add', action='store_true', help='Enable fused dropout and add')
PARSER.add_argument('--fused_gemm_gelu', action='store_true', help='Enable fused GEMM and GELU')
PARSER.add_argument('--packed_samples', action='store_true', help='Enable packed samples')
PARSER.add_argument('--use_transformer_engine2', action='store_true', help='Enable transformer engine v2')
PARSER.add_argument('--cuda_graph_mode', type=str, help='CUDA graph mode')
PARSER.add_argument('--use_cuda_graph', action='store_true', help='Use CUDA graph')
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment