Unverified Commit 67f2aa72 authored by pnunna93's avatar pnunna93 Committed by GitHub
Browse files

Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556)

This PR has the following changes:
- Changed torch.distributed.launch to torchrun. torch.distributed.launch
is deprecated in the latest PyTorch, and moving to torchrun is recommended -
https://pytorch.org/docs/stable/elastic/run.html


- Changes to the AMD GPU detection logic. The AMD GPU detection logic logs a
warning when containers have only renderD* devices in /dev/dri; this change
resolves those warnings.

---------
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
parent e1df877b
......@@ -26,7 +26,7 @@ def get_vendor(self):
logger.warning('Cannot find NVIDIA GPU device.')
return 'nvidia'
if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
if not list(Path('/dev/dri').glob('card*')):
if not list(Path('/dev/dri').glob('renderD*')):
logger.warning('Cannot find AMD GPU device.')
return 'amd'
if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):
......
......@@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
torch_dist_params = '' if mode.node_num == 1 else \
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
mode_command = (
f'python3 -m torch.distributed.launch'
f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f'torchrun'
f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
)
......
......@@ -105,8 +105,8 @@ def test_get_mode_command(self):
'node_num': 'all',
},
'expected_command': (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=1 '
'torchrun '
'--no_python --nproc_per_node=1 '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
......@@ -123,8 +123,8 @@ def test_get_mode_command(self):
'node_num': 1,
},
'expected_command': (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 '
'torchrun '
'--no_python --nproc_per_node=8 '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment