OpenDAS / deepspeed
Commit dd1754fb, authored Jan 31, 2020 by Jeff Rasley

    add deepspeed launcher

Parent: 3354cff3
Showing 1 changed file with 126 additions and 0 deletions (+126, -0).
deepspeed/pt/deepspeed_launch.py    0 → 100755 (new file)
"""
Copyright 2020 The Microsoft DeepSpeed Team: deepspeed@microsoft.com
"""
import
sys
import
subprocess
import
os
import
socket
import
json
import
base64
from
collections
import
defaultdict
from
argparse
import
ArgumentParser
,
REMAINDER
import
torch
def
parse_args
():
parser
=
ArgumentParser
(
description
=
"DeepSpeed distributed training launch "
"utilty that creates multiple distributed "
"processes on a single node"
)
# Optional arguments for the launch helper
parser
.
add_argument
(
"--node_rank"
,
type
=
int
,
default
=
0
,
help
=
"The rank of the node for multi-node distributed "
"training"
)
parser
.
add_argument
(
"--master_addr"
,
default
=
"127.0.0.1"
,
type
=
str
,
help
=
"Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1"
)
parser
.
add_argument
(
"--master_port"
,
default
=
29500
,
type
=
int
,
help
=
"Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training"
)
parser
.
add_argument
(
"--world_info"
,
default
=
"None"
,
type
=
str
,
help
=
"world info base64 encoded dictionary"
)
# positional
parser
.
add_argument
(
"training_script"
,
type
=
str
,
help
=
"The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script"
)
# rest from the training program
parser
.
add_argument
(
'training_script_args'
,
nargs
=
REMAINDER
)
return
parser
.
parse_args
()
def
main
():
args
=
parse_args
()
current_env
=
os
.
environ
.
copy
()
for
k
in
current_env
.
keys
():
if
"NCCL"
in
k
:
print
(
args
.
node_rank
,
k
,
current_env
[
k
],
flush
=
True
)
world_info
=
None
assert
args
.
world_info
!=
"None"
,
"must provide world info dict"
world_info
=
base64
.
urlsafe_b64decode
(
args
.
world_info
)
world_info
=
json
.
loads
(
world_info
)
print
(
"WORLD INFO DICT: {}"
.
format
(
world_info
),
flush
=
True
)
node_list
=
list
(
world_info
.
keys
())
args
.
nnodes
=
len
(
node_list
)
local_node
=
node_list
[
args
.
node_rank
]
local_gpu_ids
=
world_info
[
local_node
]
num_local_procs
=
len
(
local_gpu_ids
)
print
(
"nnodes={}, num_local_procs={}, node_rank={}"
.
format
(
args
.
nnodes
,
num_local_procs
,
args
.
node_rank
),
flush
=
True
)
global_rank_mapping
=
defaultdict
(
list
)
curr_global_rank
=
0
dist_world_size
=
0
for
node_id
in
node_list
:
gids
=
world_info
[
node_id
]
dist_world_size
+=
len
(
gids
)
for
gid
in
gids
:
global_rank_mapping
[
node_id
].
append
(
curr_global_rank
)
curr_global_rank
+=
1
print
(
"global_rank_mapping={}"
.
format
(
global_rank_mapping
),
flush
=
True
)
print
(
"dist_world_size={}"
.
format
(
dist_world_size
),
flush
=
True
)
current_env
[
"CUDA_VISIBLE_DEVICES"
]
=
","
.
join
(
map
(
str
,
local_gpu_ids
))
print
(
"Setting CUDA_VISIBLE_DEVICES={}"
.
format
(
current_env
[
"CUDA_VISIBLE_DEVICES"
]),
flush
=
True
)
exclusion_counts_per_node
=
None
# set PyTorch distributed related environmental variables
current_env
[
"MASTER_ADDR"
]
=
args
.
master_addr
current_env
[
"MASTER_PORT"
]
=
str
(
args
.
master_port
)
current_env
[
"WORLD_SIZE"
]
=
str
(
dist_world_size
)
processes
=
[]
for
local_rank
in
range
(
0
,
num_local_procs
):
# each process's rank
dist_rank
=
global_rank_mapping
[
local_node
][
local_rank
]
current_env
[
"RANK"
]
=
str
(
dist_rank
)
# spawn the processes
cmd
=
[
sys
.
executable
,
"-u"
,
args
.
training_script
,
"--local_rank={}"
.
format
(
local_rank
)
]
+
args
.
training_script_args
process
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
)
processes
.
append
(
process
)
for
process
in
processes
:
process
.
wait
()
if
__name__
==
"__main__"
:
main
()
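For context, not part of this commit: the launcher expects --world_info to be a base64-encoded JSON dictionary mapping each node name to the list of GPU ids to use on that node, and it appends --local_rank=<n> to each spawned process's command line, so the training script is assumed to accept that flag. Below is a minimal sketch of how a caller might build that payload and invoke the launcher for a single node with two GPUs; the node name "localhost", the script path "train.py", and its arguments are illustrative assumptions, not values taken from this commit.

    # Illustrative sketch only -- shows one way to encode the world info dict
    # and invoke deepspeed_launch.py; names below are hypothetical.
    import base64
    import json
    import subprocess
    import sys

    # Assumption: one node named "localhost" exposing GPUs 0 and 1.
    world_info = {"localhost": [0, 1]}
    encoded = base64.urlsafe_b64encode(json.dumps(world_info).encode()).decode()

    cmd = [
        sys.executable,
        "deepspeed/pt/deepspeed_launch.py",
        "--node_rank=0",
        "--master_addr=127.0.0.1",
        "--master_port=29500",
        "--world_info={}".format(encoded),
        "train.py",        # hypothetical training script; must accept --local_rank
        "--epochs", "1",   # hypothetical training-script args (passed through via REMAINDER)
    ]
    subprocess.check_call(cmd)

With this input the launcher would spawn two processes on the node, with RANK 0 and 1, WORLD_SIZE 2, and CUDA_VISIBLE_DEVICES set to "0,1".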