"git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "d21c142eb89ce42817165368641329072e2ad8fb"
Unverified commit 890ce65d, authored by Yifan Xiong, committed by GitHub
Browse files

Bug - Fix torch.distributed command for single node (#201)

Fix `torch.distributed` command for single node.
parent f91f97b6
...@@ -123,20 +123,13 @@ def __get_mode_command(self, benchmark_name, mode): ...@@ -123,20 +123,13 @@ def __get_mode_command(self, benchmark_name, mode):
elif mode.name == 'torch.distributed': elif mode.name == 'torch.distributed':
# TODO: replace with torch.distributed.run in v1.9 # TODO: replace with torch.distributed.run in v1.9
# TODO: only supports node_num=1 and node_num=all currently # TODO: only supports node_num=1 and node_num=all currently
torch_dist_params = '' if mode.node_num == 1 else \
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
mode_command = ( mode_command = (
'python3 -m torch.distributed.launch ' f'python3 -m torch.distributed.launch'
'--use_env --no_python --nproc_per_node={proc_num} ' f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
'--nnodes={node_num} --node_rank=$NODE_RANK ' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
'{command} {torch_distributed_suffix}'
).format(
proc_num=mode.proc_num,
node_num=1 if mode.node_num == 1 else '$NNODES',
command=exec_command,
torch_distributed_suffix=(
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).format(name=benchmark_name),
) )
elif mode.name == 'mpi': elif mode.name == 'mpi':
mode_command = ( mode_command = (
......
...@@ -116,8 +116,6 @@ def test_get_mode_command(self): ...@@ -116,8 +116,6 @@ def test_get_mode_command(self):
'expected_command': ( 'expected_command': (
'python3 -m torch.distributed.launch ' 'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 ' '--use_env --no_python --nproc_per_node=8 '
'--nnodes=1 --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp ' 'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl' 'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment