Commit 1757ef69 authored by Myle Ott, committed by Facebook Github Bot
Browse files

Better distributed init for SLURM

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/696

Differential Revision: D16068394

Pulled By: myleott

fbshipit-source-id: 92b44470ab8aeb9f99838cf74e34176104eb2b87
parent b5949373
...@@ -37,7 +37,9 @@ def infer_init_method(args): ...@@ -37,7 +37,9 @@ def infer_init_method(args):
# we can determine the init method automatically for Slurm # we can determine the init method automatically for Slurm
elif args.distributed_port > 0: elif args.distributed_port > 0:
node_list = os.environ.get('SLURM_JOB_NODELIST') node_list = os.environ.get('SLURM_STEP_NODELIST')
if node_list is None:
node_list = os.environ.get('SLURM_JOB_NODELIST')
if node_list is not None: if node_list is not None:
try: try:
hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list]) hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment