Unverified Commit 1c76a516 authored by Stas Bekman, committed by GitHub

solve the port conflict (#14362)

parent 9e37c5cd
@@ -64,6 +64,28 @@ def load_json(path):
         return json.load(f)
 
 
+def get_master_port(real_launcher=False):
+    """
+    When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
+    the issue is that once the port is tied up it can't be used anywhere else outside of this process,
+    since torch.dist doesn't free the port until the process exits. Therefore, for the sake of being
+    able to run both the emulated launcher and the normal launcher tests we need 2 distinct ports.
+
+    This function will give the right port in the right context. For the real launcher it'll give the
+    base port, for the emulated launcher it'll give the base port + 1. In both cases a string is
+    returned.
+
+    Args:
+        `real_launcher`: whether a real launcher is going to be used, or the emulated one
+    """
+    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    if not real_launcher:
+        master_port_base = str(int(master_port_base) + 1)
+    return master_port_base
+
+
 def require_deepspeed_aio(test_case):
     """
     Decorator marking a test that requires deepspeed aio (nvme)
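The new helper is small enough to sketch in isolation. Below is a minimal standalone illustration of the two-port scheme, assuming DEFAULT_MASTER_PORT is a port number kept as a string ("10999" here is a hypothetical value; the real constant is defined earlier in the test module):

import os

DEFAULT_MASTER_PORT = "10999"  # assumed placeholder; the actual constant lives in the test module

def get_master_port(real_launcher=False):
    # DS_TEST_PORT lets the environment override the base port
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        # the emulated launcher gets base + 1 so it can never collide with a
        # port still held by a real deepspeed/torch.distributed launcher
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base

print(get_master_port(real_launcher=True))   # -> "10999" (real launcher)
print(get_master_port(real_launcher=False))  # -> "11000" (emulated launcher)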
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
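These dist_env_1_gpu dicts are the emulated-launcher side of the scheme: the tests export them as environment variables before initializing torch.distributed. As a hedged illustration only (the suite may apply the dict through its own helper rather than this), one standard way to scope such an env dict to a block of test code:

import os
from unittest import mock

dist_env_1_gpu = dict(
    MASTER_ADDR="localhost",
    MASTER_PORT=get_master_port(real_launcher=False),  # base port + 1
    RANK="0",
    LOCAL_RANK="0",
    WORLD_SIZE="1",
)

with mock.patch.dict(os.environ, dist_env_1_gpu):
    # any torch.distributed init in this block binds base_port + 1, leaving
    # the base port free for a real launcher spawned as a subprocess
    pass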
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size
 
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )