transformers · Commit 1c76a516 (unverified)

solve the port conflict (#14362)

Authored Nov 10, 2021 by Stas Bekman; committed by GitHub, Nov 10, 2021.
Parent: 9e37c5cd

Showing 1 changed file with 25 additions and 3 deletions:

tests/deepspeed/test_deepspeed.py (+25, -3)
@@ -64,6 +64,28 @@ def load_json(path):
         return json.load(f)


+def get_master_port(real_launcher=False):
+    """
+    When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
+    the issue is that once the port is tied it can't be used anywhere else outside of this process,
+    since torch.dist doesn't free the port until the process exits. Therefore, for the sake of being
+    able to run both emulated launcher and normal launcher tests, we need 2 distinct ports.
+
+    This function will give the right port in the right context. For the real launcher it'll give the
+    base port, for the emulated launcher it'll give the base port + 1. In both cases a string is
+    returned.
+
+    Args:
+        `real_launcher`: whether a real launcher is going to be used, or the emulated one
+
+    """
+
+    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    if not real_launcher:
+        master_port_base = str(int(master_port_base) + 1)
+    return master_port_base
+
+
 def require_deepspeed_aio(test_case):
     """
     Decorator marking a test that requires deepspeed aio (nvme)
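To make the two-port scheme concrete, here is a minimal standalone sketch of the new helper's behavior. The DEFAULT_MASTER_PORT value of "10999" is an assumption for illustration; the real default is defined elsewhere in this test file.

import os

DEFAULT_MASTER_PORT = "10999"  # assumed default for this sketch; the test file defines the real value

def get_master_port(real_launcher=False):
    # the real launcher (deepspeed / torch.distributed) gets the base port;
    # the in-process emulated launcher gets base + 1, so both can coexist
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base

print(get_master_port(real_launcher=True))   # "10999" -- base port for the real launcher
print(get_master_port(real_launcher=False))  # "11000" -- base + 1 for the emulated launcher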
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     # results with more gpus because we use very little data)
     num_gpus = min(2, get_gpu_count()) if distributed else 1
-    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
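With this change, the command list that get_launcher() hands to the subprocess runner embeds the base port. A sketch of the resulting value, assuming one GPU and no DS_TEST_PORT override (the "10999" port is the same illustrative assumption as above):

num_gpus = 1
master_port = "10999"  # assumed base port, i.e. what get_master_port(real_launcher=True) returns
cmd = f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
print(cmd)
# ['deepspeed', '--num_nodes', '1', '--num_gpus', '1', '--master_port', '10999']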
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     def setUp(self):
         super().setUp()

-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
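The emulated-launcher tests consume self.dist_env_1_gpu by injecting it into the environment around an in-process Trainer run (the transformers test suite wraps this pattern in a mockenv_context helper). A minimal sketch of that pattern using unittest.mock directly:

import os
from unittest import mock

dist_env_1_gpu = dict(
    MASTER_ADDR="localhost", MASTER_PORT="11000", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)

# patch.dict sets the variables for the duration of the block and restores os.environ afterwards,
# so the emulated single-gpu run sees a complete torch.distributed environment
with mock.patch.dict(os.environ, dist_env_1_gpu):
    assert os.environ["MASTER_PORT"] == "11000"
# outside the block the variables are gone again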
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.n_epochs = args.num_train_epochs
         self.batch_size = args.train_batch_size

-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
         self.dist_env_1_gpu = dict(
             MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
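The conflict this commit fixes is plain OS-level port binding: once the emulated run's process has tied the master port, a real launcher started from the same test session cannot bind it again until the process exits. A socket-level sketch of that failure mode (the port number is illustrative):

import socket

s1 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s1.bind(("localhost", 10999))  # first bind succeeds, as torch.dist's store would

s2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    s2.bind(("localhost", 10999))  # second bind of the held port
except OSError as err:
    print(f"port conflict: {err}")  # typically "Address already in use"
finally:
    s2.close()
    s1.close()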