Unverified Commit 1c76a516 authored by Stas Bekman, committed by GitHub

solve the port conflict (#14362)

parent 9e37c5cd
@@ -64,6 +64,28 @@ def load_json(path):
         return json.load(f)
 
 
+def get_master_port(real_launcher=False):
+    """
+    When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
+    the issue is that once the port is tied it can't be used anywhere else outside of this process,
+    since torch.dist doesn't free the port until the process exits. Therefore for the sake of being
+    able to run both emulated launcher and normal launcher tests we need 2 distinct ports.
+
+    This function will give the right port in the right context. For real launcher it'll give the
+    base port, for emulated launcher it'll give the base port + 1. In both cases a string is
+    returned.
+
+    Args:
+        `real_launcher`: whether a real launcher is going to be used, or the emulated one
+    """
+    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    if not real_launcher:
+        master_port_base = str(int(master_port_base) + 1)
+    return master_port_base
+
+
 def require_deepspeed_aio(test_case):
     """
     Decorator marking a test that requires deepspeed aio (nvme)
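For reference, the new helper can be exercised on its own. The sketch below restates it outside the test module, assuming a base port of "10999" purely for illustration (in the test file the base comes from the DEFAULT_MASTER_PORT constant and can be overridden via the DS_TEST_PORT environment variable):

```python
# Standalone sketch of the helper added above; "10999" is an assumed
# illustration value, not necessarily the real DEFAULT_MASTER_PORT.
import os

DEFAULT_MASTER_PORT = "10999"  # assumed default; DS_TEST_PORT overrides it


def get_master_port(real_launcher=False):
    # Real launchers (deepspeed / torch.distributed) bind the base port; the
    # in-process emulation gets base + 1 so the two never contend for one port.
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base


if __name__ == "__main__":
    print(get_master_port(real_launcher=True))   # -> "10999", passed to the real launcher
    print(get_master_port(real_launcher=False))  # -> "11000", exported as MASTER_PORT for emulation
```

Because torch.distributed keeps the port bound until the process exits, giving the emulated launcher its own port lets both kinds of tests run in the same session without conflicting.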
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )
...
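To make the setUp changes above concrete, here is a minimal sketch of how the emulated single-gpu environment can be applied around a test body using only the standard library; the actual test suite may use its own context helper, so treat the mock.patch.dict pattern as an illustration rather than the suite's exact mechanism:

```python
# Illustration only: applies the emulated-launcher env vars for the duration of
# a block, then restores os.environ. The base port "10999" is an assumption.
import os
from unittest import mock

# Mirrors get_master_port(real_launcher=False): emulated launcher uses base + 1.
master_port = str(int(os.environ.get("DS_TEST_PORT", "10999")) + 1)

dist_env_1_gpu = dict(
    MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)

with mock.patch.dict(os.environ, dist_env_1_gpu):
    # Anything that initializes torch.distributed here binds MASTER_PORT
    # (base + 1), leaving the base port free for a real launcher started
    # by other tests in the same session.
    pass
```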