"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b4d4d6fe87ffcd7508307970cdf8fa3eda288701"
Unverified commit d0e96c6d authored by Jeff Rasley, committed by GitHub

[deepspeed] Enable multiple test runs on single box, defer to DS_TEST_PORT if set (#14331)



* defer to DS_TEST_PORT if set

* style
Co-authored-by: Stas Bekman <stas@stason.org>
parent dfb00bf6
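For context before the diff, here is a minimal standalone sketch (a hypothetical file, not part of this commit) of the pattern being introduced: the launcher defers to a DS_TEST_PORT environment variable and only falls back to the hard-coded default, so concurrent test runs on one box can each rendezvous on their own torch.distributed port.

import os

# Default torch.distributed port, matching the constant added by this commit.
DEFAULT_MASTER_PORT = "10999"

def get_launcher(num_gpus=1):
    # Defer to DS_TEST_PORT if the caller exported it; otherwise fall back to
    # the default so a single, unparameterized run keeps working unchanged.
    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()

if __name__ == "__main__":
    # First shell:  python sketch.py                     -> port 10999
    # Second shell: DS_TEST_PORT=10998 python sketch.py  -> port 10998
    print(" ".join(get_launcher(num_gpus=2)))

With each run exporting a distinct DS_TEST_PORT, two deepspeed launchers no longer collide on the same master port, which is what enables multiple test runs on a single box.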
@@ -51,6 +51,9 @@ with ExtendSysPath(tests_dir):
 set_seed(42)
 
+# default torch.distributed port
+DEFAULT_MASTER_PORT = "10999"
+
 T5_SMALL = "t5-small"
 T5_TINY = "patrickvonplaten/t5-tiny-random"
 GPT2_TINY = "sshleifer/tiny-gpt2"
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
 
 ZERO2 = "zero2"
@@ -107,8 +111,9 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()
 
+        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
         self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
+            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
 
     def test_init_zero3(self):
@@ -176,8 +181,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size
 
+        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
         self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
+            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
 
         self.ds_config_file = dict(
...
@@ -41,6 +41,9 @@ with ExtendSysPath(tests_dir):
 set_seed(42)
 
+# default torch.distributed port
+DEFAULT_MASTER_PORT = "10999"
+
 # translation
 FSMT_TINY = "stas/tiny-wmt19-en-de"
 BART_TINY = "sshleifer/bart-tiny-random"
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
 
 def make_task_cmds():
...