Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556)

This PR has the following changes: - torch.distributed.launch is changed to torchrun. torch.distributed.launch is deprecated in the latest PyTorch, and moving to torchrun is recommended - https://pytorch.org/docs/stable/elastic/run.html - Changes to the AMD GPU detection logic. The AMD GPU detection logic throws a warning when containers have only renderD devices in /dev/dri; this change resolves those warnings. --------- Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>

Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556)
This PR has the following changes: - torch.distributed.launch is changed to torchrun. torch.distributed.launch is deprecated in the latest PyTorch, and moving to torchrun is recommended - https://pytorch.org/docs/stable/elastic/run.html - Changes to the AMD GPU detection logic. The AMD GPU detection logic throws a warning when containers have only renderD devices in /dev/dri; this change resolves those warnings. --------- Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
67f2aa72 · pnunna93 · GitHub · e1df877b · 67f2aa72 · 67f2aa72
Unverified Commit 67f2aa72 authored Aug 08, 2023 by pnunna93 Committed by GitHub Aug 08, 2023
Showing with 7 additions and 7 deletions

superbench/common/devices/gpu.py superbench/common/devices/gpu.py +1 -1

superbench/runner/runner.py superbench/runner/runner.py +2 -2

tests/runner/test_runner.py tests/runner/test_runner.py +4 -4

No files found.
--- a/superbench/common/devices/gpu.py
+++ b/superbench/common/devices/gpu.py
@@ -26,7 +26,7 @@ def get_vendor(self):
                logger.warning('Cannot find NVIDIA GPU device.')
            return 'nvidia'
        if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
-            if not list(Path('/dev/dri').glob('card*')):
+            if not list(Path('/dev/dri').glob('renderD*')):
                logger.warning('Cannot find AMD GPU device.')
            return 'amd'
        if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):

--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
            torch_dist_params = '' if mode.node_num == 1 else \
                '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
            mode_command = (
-                f'python3 -m torch.distributed.launch'
+                f'torchrun'
-                f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
+                f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
                f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
                f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
            )

--- a/tests/runner/test_runner.py
+++ b/tests/runner/test_runner.py
@@ -105,8 +105,8 @@ def test_get_mode_command(self):
                    'node_num': 'all',
                },
                'expected_command': (
-                    'python3 -m torch.distributed.launch '
+                    'torchrun '
-                    '--use_env --no_python --nproc_per_node=1 '
+                    '--no_python --nproc_per_node=1 '
                    '--nnodes=$NNODES --node_rank=$NODE_RANK '
                    '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
                    f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
@@ -123,8 +123,8 @@ def test_get_mode_command(self):
                    'node_num': 1,
                },
                'expected_command': (
-                    'python3 -m torch.distributed.launch '
+                    'torchrun '
-                    '--use_env --no_python --nproc_per_node=8 '
+                    '--no_python --nproc_per_node=8 '
                    f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
                    'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
                    'superbench.benchmarks.foo.parameters.distributed_backend=nccl'