Unverified commit d0e96c6d authored by Jeff Rasley, committed by GitHub

[deepspeed] Enable multiple test runs on single box, defer to DS_TEST_PORT if set (#14331)

* defer to DS_TEST_PORT if set

* style

Co-authored-by: Stas Bekman <stas@stason.org>
parent dfb00bf6
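The change is small but repeated: every place that previously hard-coded port 10999 now resolves the master port at runtime, deferring to the DS_TEST_PORT environment variable and falling back to the old value. A minimal standalone sketch of the pattern (resolve_master_port is a hypothetical helper name; the diffs below inline the same expression):

    import os

    # default torch.distributed port, used when DS_TEST_PORT is not set
    DEFAULT_MASTER_PORT = "10999"

    def resolve_master_port():
        # an externally set DS_TEST_PORT wins, so concurrent test runs on
        # one machine can each claim a distinct rendezvous port
        return os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)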
@@ -51,6 +51,9 @@ with ExtendSysPath(tests_dir):
 set_seed(42)
 
+# default torch.distributed port
+DEFAULT_MASTER_PORT = "10999"
+
 T5_SMALL = "t5-small"
 T5_TINY = "patrickvonplaten/t5-tiny-random"
 GPT2_TINY = "sshleifer/tiny-gpt2"
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
 
 ZERO2 = "zero2"
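For context, a self-contained sketch of what get_launcher returns after this change; get_gpu_count is stubbed here for illustration, whereas the test suite imports the real helper:

    import os

    DEFAULT_MASTER_PORT = "10999"

    def get_gpu_count():
        # stand-in for the real gpu-count helper used by the tests
        return 2

    def get_launcher(distributed=False):
        num_gpus = min(2, get_gpu_count()) if distributed else 1
        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
        return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()

    # with DS_TEST_PORT=29600 set, get_launcher(distributed=True) yields:
    # ['deepspeed', '--num_nodes', '1', '--num_gpus', '2',
    #  '--master_port', '29600']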
@@ -107,8 +111,9 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()
 
+        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
         self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
+            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
 
     def test_init_zero3(self):
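The single-gpu tests emulate a torch.distributed setup purely through environment variables, so a configurable MASTER_PORT is all that is needed to keep concurrent runs from colliding on the rendezvous port. A hedged sketch of applying such an env dict around code under test, using the standard library's patch.dict (the actual suite may use its own context manager for this):

    import os
    from unittest.mock import patch

    DEFAULT_MASTER_PORT = "10999"  # as in the diffs above

    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    dist_env_1_gpu = dict(
        MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
    )

    with patch.dict(os.environ, dist_env_1_gpu):
        # code that initializes torch.distributed/deepspeed runs here and
        # reads the rendezvous address and port from the patched environment
        pass

The same pattern is applied a second time in the trainer integration tests below.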
@@ -176,8 +181,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size
 
+        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
         self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
+            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
 
         self.ds_config_file = dict(
The same default-port constant and launcher change are applied in a second test file:

@@ -41,6 +41,9 @@ with ExtendSysPath(tests_dir):
 set_seed(42)
 
+# default torch.distributed port
+DEFAULT_MASTER_PORT = "10999"
+
 # translation
 FSMT_TINY = "stas/tiny-wmt19-en-de"
 BART_TINY = "sshleifer/bart-tiny-random"
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
 
 def make_task_cmds():
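With both files deferring to DS_TEST_PORT, two suites can run side by side on one box simply by exporting different ports. A hedged sketch of driving that from Python; the pytest targets are placeholders, not paths confirmed by this diff:

    import os
    import subprocess

    def run_suite(target, port):
        # each invocation gets its own copy of the environment with a unique
        # DS_TEST_PORT, so the deepspeed launchers cannot collide
        env = dict(os.environ, DS_TEST_PORT=str(port))
        return subprocess.Popen(["pytest", target], env=env)

    procs = [
        run_suite("tests/suite_a", 29500),  # placeholder test target
        run_suite("tests/suite_b", 29501),  # placeholder test target
    ]
    for p in procs:
        p.wait()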