OpenDAS / deepspeed · Commits

Commit 852c524a (unverified)
Authored Jan 25, 2021 by sdtblck; committed by GitHub on Jan 25, 2021
Add optional timeout parameter to deepspeed.init_distributed (#637)
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>

Parent: 34c83a5a
Showing 2 changed files with 21 additions and 12 deletions:

deepspeed/constants.py (+8 -0)
deepspeed/utils/distributed.py (+13 -12)
deepspeed/constants.py (view file @ 852c524a)
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
from datetime import timedelta

#############################################
# Torch distributed constants
#############################################
TORCH_DISTRIBUTED_DEFAULT_PORT = 29500

# Default process group wide timeout, if applicable.
# This only applies to the gloo and nccl backends
# (only if NCCL_BLOCKING_WAIT or NCCL_ASYNC_ERROR_HANDLING is set to 1).
# To make an attempt at backwards compatibility with THD, we use an
# extraordinarily high default timeout, given that THD did not have timeouts.
default_pg_timeout = timedelta(minutes=30)
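
As a quick sanity check, the new constant can be imported directly from deepspeed.constants; a minimal sketch, assuming DeepSpeed is installed at or after this commit:

from datetime import timedelta

from deepspeed.constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout

# Both values come straight from the file above: the 29500 default port and a
# deliberately generous 30-minute process-group timeout.
assert TORCH_DISTRIBUTED_DEFAULT_PORT == 29500
assert default_pg_timeout == timedelta(minutes=30)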
deepspeed/utils/distributed.py (view file @ 852c524a)
...
@@ -3,25 +3,25 @@ Copyright 2020 The Microsoft DeepSpeed Team
 '''
 import os
 import torch
+from datetime import timedelta

 from .logging import logger
-from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
+from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout


 def init_distributed(dist_backend="nccl",
                      auto_mpi_discovery=True,
                      distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
-                     verbose=True):
-    """Initialize torch.distributed backend, potentially performing MPI discovery if needed.
+                     verbose=True,
+                     timeout=default_pg_timeout):
+    """
+    Initialize torch.distributed backend, potentially performing MPI discovery if needed

     Arguments:
-        dist_backend: torch distributed backend, e.g., nccl, mpi, gloo
-        auto_mpi_discovery: if distributed environment variables are not set, attempt to discover them from MPI
-        distributed_port: torch distributed backend port
-        verbose: verbose logging
+        dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo
+        auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI
+        distributed_port (int, optional): torch distributed backend port
+        verbose (bool, optional): verbose logging
+        timeout (timedelta, optional): Timeout for operations executed against the process group. Default value equals 30 minutes.
     """
     required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
     if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)):
...
@@ -38,7 +38,8 @@ def init_distributed(dist_backend="nccl",
         if verbose:
             logger.info("Initializing torch distributed with backend: {}".format(dist_backend))
-        torch.distributed.init_process_group(backend=dist_backend)
+        assert isinstance(timeout, timedelta)
+        torch.distributed.init_process_group(backend=dist_backend, timeout=timeout)


 def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True):
...
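
With this change, a caller can cap how long collective operations wait before erroring out, instead of always relying on the 30-minute default. Because of the assert isinstance(timeout, timedelta) guard, the value must be a datetime.timedelta, not a plain number of seconds. A minimal usage sketch, assuming the distributed environment variables checked above (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT, LOCAL_RANK) are already set; the nccl backend and the 10-minute value are illustrative choices, not part of the commit:

from datetime import timedelta

import deepspeed

# Illustrative only: pass a custom process-group timeout instead of the
# default_pg_timeout (30 minutes). The backend and 10-minute value are
# assumptions for this sketch.
deepspeed.init_distributed(dist_backend="nccl",
                           timeout=timedelta(minutes=10))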