Commit 91fcbd68 authored by Jared Casper's avatar Jared Casper
Browse files

Merge branch 'torch_dist_env_init' into 'main'

Changed the torch.distributed init method from TCP to env

See merge request ADLR/megatron-lm!297
parents 78a69eaf ee4fd509
......@@ -176,14 +176,10 @@ def _initialize_distributed():
args.local_rank = device
torch.cuda.set_device(device)
# Call the init process
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
torch.distributed.init_process_group(
backend=args.distributed_backend,
world_size=args.world_size, rank=args.rank,
init_method=init_method)
world_size=args.world_size, rank=args.rank)
# Set the tensor model-parallel, pipeline model-parallel, and
# data-parallel communicators.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment