"tests/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "fa1ae3b7c50c35e345ff22da02e84bc8b1351ac4"
Unverified Commit bcb988bd authored by Chao Ma's avatar Chao Ma Committed by GitHub
Browse files

[Distributed] Small fix on launch script (#1867)

* update

* update

* update

* update
parent 6963d796
...@@ -26,7 +26,9 @@ will be able to access the partitioned data. ...@@ -26,7 +26,9 @@ will be able to access the partitioned data.
### Step 3: Launch distributed jobs ### Step 3: Launch distributed jobs
First make sure that the master node has permission to ssh to all the other nodes. Then run the script: First make sure that the master node has permission to ssh to all the other nodes. Edit the `ip_config.txt` file so that it lists the IP addresses of your own instances.
Then run the script:
```bash ```bash
python3 ~/dgl/tools/launch.py \ python3 ~/dgl/tools/launch.py \
......
...@@ -63,11 +63,15 @@ def submit_jobs(args, udf_command): ...@@ -63,11 +63,15 @@ def submit_jobs(args, udf_command):
torch_cmd = torch_cmd + ' ' + '--node_rank=' + str(0) torch_cmd = torch_cmd + ' ' + '--node_rank=' + str(0)
torch_cmd = torch_cmd + ' ' + '--master_addr=' + str(hosts[0][0]) torch_cmd = torch_cmd + ' ' + '--master_addr=' + str(hosts[0][0])
torch_cmd = torch_cmd + ' ' + '--master_port=' + str(1234) torch_cmd = torch_cmd + ' ' + '--master_port=' + str(1234)
for node_id, host in enumerate(hosts): for node_id, host in enumerate(hosts):
ip, _ = host ip, _ = host
new_torch_cmd = torch_cmd.replace('node_rank=0', 'node_rank='+str(node_id)) new_torch_cmd = torch_cmd.replace('node_rank=0', 'node_rank='+str(node_id))
new_udf_command = udf_command.replace('python3', 'python3 ' + new_torch_cmd) if 'python3' in udf_command:
new_udf_command = udf_command.replace('python3', 'python3 ' + new_torch_cmd)
elif 'python2' in udf_command:
new_udf_command = udf_command.replace('python2', 'python2 ' + new_torch_cmd)
else:
new_udf_command = udf_command.replace('python', 'python ' + new_torch_cmd)
cmd = client_cmd + ' ' + new_udf_command cmd = client_cmd + ' ' + new_udf_command
cmd = 'cd ' + str(args.workspace) + '; ' + cmd cmd = 'cd ' + str(args.workspace) + '; ' + cmd
execute_remote(cmd, ip, thread_list) execute_remote(cmd, ip, thread_list)
...@@ -84,16 +88,15 @@ def main(): ...@@ -84,16 +88,15 @@ def main():
parser.add_argument('--num_client', type=int, parser.add_argument('--num_client', type=int,
help='Total number of client processes in the cluster') help='Total number of client processes in the cluster')
parser.add_argument('--conf_path', type=str, parser.add_argument('--conf_path', type=str,
help='The path to the partition config file. This path can be \ help='The file (in workspace) of the partition config file')
a remote path like s3 and dgl will download this file automatically')
parser.add_argument('--ip_config', type=str, parser.add_argument('--ip_config', type=str,
help='The file for IP configuration for server processes') help='The file (in workspace) of IP configuration for server processes')
args, udf_command = parser.parse_known_args() args, udf_command = parser.parse_known_args()
assert len(udf_command) == 1, 'Please provide user command line.' assert len(udf_command) == 1, 'Please provide user command line.'
assert args.num_client > 0, '--num_client must be a positive number.' assert args.num_client > 0, '--num_client must be a positive number.'
udf_command = str(udf_command[0]) udf_command = str(udf_command[0])
if 'python' not in udf_command: if 'python' not in udf_command:
raise RuntimeError("DGL launch can only support: python ...") raise RuntimeError("DGL launching script can only support Python executable file.")
submit_jobs(args, udf_command) submit_jobs(args, udf_command)
def signal_handler(signal, frame): def signal_handler(signal, frame):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment