Unverified Commit 67f2aa72 authored by pnunna93's avatar pnunna93 Committed by GitHub
Browse files

Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556)

This PR has the following changes:
- Changed torch.distributed.launch to torchrun. torch.distributed.launch
is deprecated in the latest PyTorch, and moving to torchrun is recommended:
https://pytorch.org/docs/stable/elastic/run.html


- Changes to the AMD GPU detection logic. The detection logic emits a
warning when a container has only renderD* nodes (and no card* nodes) in
/dev/dri; checking for renderD* instead of card* resolves those warnings.

---------
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
parent e1df877b
...@@ -26,7 +26,7 @@ def get_vendor(self): ...@@ -26,7 +26,7 @@ def get_vendor(self):
logger.warning('Cannot find NVIDIA GPU device.') logger.warning('Cannot find NVIDIA GPU device.')
return 'nvidia' return 'nvidia'
if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir(): if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
if not list(Path('/dev/dri').glob('card*')): if not list(Path('/dev/dri').glob('renderD*')):
logger.warning('Cannot find AMD GPU device.') logger.warning('Cannot find AMD GPU device.')
return 'amd' return 'amd'
if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')): if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):
......
...@@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): ...@@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
torch_dist_params = '' if mode.node_num == 1 else \ torch_dist_params = '' if mode.node_num == 1 else \
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
mode_command = ( mode_command = (
f'python3 -m torch.distributed.launch' f'torchrun'
f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
) )
......
...@@ -105,8 +105,8 @@ def test_get_mode_command(self): ...@@ -105,8 +105,8 @@ def test_get_mode_command(self):
'node_num': 'all', 'node_num': 'all',
}, },
'expected_command': ( 'expected_command': (
'python3 -m torch.distributed.launch ' 'torchrun '
'--use_env --no_python --nproc_per_node=1 ' '--no_python --nproc_per_node=1 '
'--nnodes=$NNODES --node_rank=$NODE_RANK ' '--nnodes=$NNODES --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
...@@ -123,8 +123,8 @@ def test_get_mode_command(self): ...@@ -123,8 +123,8 @@ def test_get_mode_command(self):
'node_num': 1, 'node_num': 1,
}, },
'expected_command': ( 'expected_command': (
'python3 -m torch.distributed.launch ' 'torchrun '
'--use_env --no_python --nproc_per_node=8 ' '--no_python --nproc_per_node=8 '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp ' 'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl' 'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment