Unverified Commit 1c76a516 authored by Stas Bekman, committed by GitHub

solve the port conflict (#14362)

parent 9e37c5cd
@@ -64,6 +64,28 @@ def load_json(path):
         return json.load(f)
 
 
+def get_master_port(real_launcher=False):
+    """
+    When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
+    the issue is that once the port is tied up it can't be used anywhere else outside of this process,
+    since torch.dist doesn't free the port until the process exits. Therefore, for the sake of being
+    able to run both the emulated launcher and the normal launcher tests we need 2 distinct ports.
+
+    This function will give the right port in the right context. For the real launcher it'll give the
+    base port, for the emulated launcher it'll give the base port + 1. In both cases a string is
+    returned.
+
+    Args:
+        `real_launcher`: whether a real launcher is going to be used, or the emulated one
+    """
+    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    if not real_launcher:
+        master_port_base = str(int(master_port_base) + 1)
+    return master_port_base
+
+
 def require_deepspeed_aio(test_case):
     """
     Decorator marking a test that requires deepspeed aio (nvme)
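The new helper is small enough to sketch in isolation. Below is a minimal standalone illustration of the two-port scheme, assuming DEFAULT_MASTER_PORT is a port number kept as a string ("10999" here is a hypothetical value; the real constant is defined earlier in the test module):

import os

DEFAULT_MASTER_PORT = "10999"  # assumed placeholder; the actual constant lives in the test module

def get_master_port(real_launcher=False):
    # DS_TEST_PORT lets the environment override the base port
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        # the emulated launcher gets base + 1 so it can never collide with a
        # port still held by a real deepspeed/torch.distributed launcher
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base

print(get_master_port(real_launcher=True))   # -> "10999" (real launcher)
print(get_master_port(real_launcher=False))  # -> "11000" (emulated launcher)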
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
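These dist_env_1_gpu dicts are the emulated-launcher side of the scheme: the tests export them as environment variables before initializing torch.distributed. As a hedged illustration only (the suite may apply the dict through its own helper rather than this), one standard way to scope such an env dict to a block of test code:

import os
from unittest import mock

dist_env_1_gpu = dict(
    MASTER_ADDR="localhost",
    MASTER_PORT=get_master_port(real_launcher=False),  # base port + 1
    RANK="0",
    LOCAL_RANK="0",
    WORLD_SIZE="1",
)

with mock.patch.dict(os.environ, dist_env_1_gpu):
    # any torch.distributed init in this block binds base_port + 1, leaving
    # the base port free for a real launcher spawned as a subprocess
    pass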
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )