Unverified commit 6855ba1c, authored by Jeff Rasley and committed by GitHub
Browse files

Update fan out flag for pdsh (#315)

* update fan out flag for pdsh
parent 3437342c
...@@ -21,6 +21,7 @@ DLTS_HOSTFILE = "/job/hostfile" ...@@ -21,6 +21,7 @@ DLTS_HOSTFILE = "/job/hostfile"
EXPORT_ENVS = ["NCCL", "PYTHON"] EXPORT_ENVS = ["NCCL", "PYTHON"]
DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env" DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
PDSH_MAX_FAN_OUT = 1024
def parse_args(args=None): def parse_args(args=None):
...@@ -294,7 +295,9 @@ def main(args=None): ...@@ -294,7 +295,9 @@ def main(args=None):
active_workers = ",".join(active_resources.keys()) active_workers = ",".join(active_resources.keys())
logger.info("Running on the following workers: %s" % active_workers) logger.info("Running on the following workers: %s" % active_workers)
pdsh_cmd_args = ['pdsh', '-w', active_workers] # PDSH flags for max node fan out and specific hosts to launch on
# See https://linux.die.net/man/1/pdsh for flag details
pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers]
num_nodes = len(active_resources.keys()) num_nodes = len(active_resources.keys())
num_gpus_per_node = None num_gpus_per_node = None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment