OpenDAS / Megatron-LM · Commits

Commit cf7efd4f
Authored Aug 30, 2021 by Sangkug Lym; committed by slym, Sep 02, 2021
Parent: b7ae685f

Commit message (a combination of 2 commits):
    allreduce overlap with wgrad gemm; change custom delay to dummy add

Showing 3 changed files with 66 additions and 6 deletions (+66, -6):
megatron/arguments.py       +4   -0
megatron/initialize.py      +12  -1
megatron/mpu/layers.py      +50  -5
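The change overlaps the tensor-parallel all-reduce of the input gradient (dgrad) with the weight-gradient (wgrad) GEMM during backprop, and, per the commit title, swaps a custom delay for a cheap "dummy add" kernel that nudges the GPU scheduler into launching the all-reduce first. Below is a minimal sketch of that pattern in isolation, outside Megatron; the helper name overlapped_backward and its arguments are illustrative only, and an already-initialized torch.distributed process group is assumed.

import torch
import torch.distributed as dist

def overlapped_backward(grad_output, input, weight, group=None):
    # dgrad GEMM produces the tensor that must be all-reduced across tensor-parallel ranks.
    grad_input = grad_output.matmul(weight)
    # Launch the all-reduce asynchronously so it can run concurrently with the wgrad GEMM.
    handle = dist.all_reduce(grad_input, group=group, async_op=True)
    # Tiny "dummy add" kernel: briefly delays the wgrad launch so the all-reduce
    # is scheduled first and gets its GPU resources.
    _ = torch.empty(1, device=grad_output.device) + 1
    # wgrad GEMM overlaps with the in-flight all-reduce.
    grad_weight = grad_output.t().matmul(input)
    handle.wait()
    return grad_input, grad_weight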
megatron/arguments.py

...
@@ -462,6 +462,10 @@ def _add_training_args(parser):
     group.add_argument('--dataloader-type', type=str, default=None,
                        choices=['single', 'cyclic'],
                        help='Single pass vs multiple pass data loader')
+    group.add_argument('--async-tensor-parallel-allreduce', action='store_true',
+                       help='Enable asynchronous execution of tensor-parallel allreduce '
+                       'with other GPU operators',
+                       dest='async_tensor_parallel_allreduce')
     return parser
...
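As a quick illustration of how the new option behaves, here is a standalone argparse mirror of just this flag (not Megatron's full argument parser): it stays False by default and flips on only when --async-tensor-parallel-allreduce is passed.

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='training')
group.add_argument('--async-tensor-parallel-allreduce', action='store_true',
                   dest='async_tensor_parallel_allreduce',
                   help='Enable asynchronous execution of tensor-parallel allreduce '
                        'with other GPU operators')

# Off unless explicitly requested on the command line.
assert parser.parse_args([]).async_tensor_parallel_allreduce is False
assert parser.parse_args(['--async-tensor-parallel-allreduce']).async_tensor_parallel_allreduce is True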
megatron/initialize.py

...
@@ -176,11 +176,22 @@ def _initialize_distributed():
             else:
                 args.local_rank = device
             torch.cuda.set_device(device)
+
+        # Increase cuda stream priority of NCCL ops when overlapping with other ops
+        if (args.async_tensor_parallel_allreduce and
+                args.tensor_model_parallel_size > 1):
+            from torch._C._distributed_c10d import ProcessGroupNCCL
+            pg_options = ProcessGroupNCCL.Options()
+            pg_options.is_high_priority_stream = True
+            pg_options._timeout = timedelta(days=7)
+        else:
+            pg_options = None
+
         # Call the init process
         torch.distributed.init_process_group(
             backend=args.distributed_backend,
             world_size=args.world_size, rank=args.rank,
-            timeout=timedelta(days=7))
+            timeout=timedelta(days=7),
+            pg_options=pg_options)
     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.
...
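Raising the NCCL stream priority matters here because the overlapped all-reduce competes with the wgrad GEMM for GPU resources; a high-priority stream keeps the communication kernels from being starved by concurrent compute. A hedged sketch of the same pg_options plumbing applied to a subgroup (the helper name is illustrative, and torch.distributed is assumed to be initialized with the NCCL backend):

import torch.distributed as dist
from torch._C._distributed_c10d import ProcessGroupNCCL

def new_high_priority_nccl_group(ranks):
    # Collectives issued on this group run on a high-priority CUDA stream.
    opts = ProcessGroupNCCL.Options()
    opts.is_high_priority_stream = True
    return dist.new_group(ranks=ranks, backend='nccl', pg_options=opts)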
megatron/mpu/layers.py

...
@@ -27,6 +27,7 @@ from torch.nn.parameter import Parameter
 from .initialize import get_tensor_model_parallel_rank
 from .initialize import get_tensor_model_parallel_world_size
+from .initialize import get_tensor_model_parallel_group
 from .mappings import copy_to_tensor_model_parallel_region
 from .mappings import gather_from_tensor_model_parallel_region
 from .mappings import reduce_from_tensor_model_parallel_region
...
@@ -198,6 +199,37 @@ class VocabParallelEmbedding(torch.nn.Module):
         return output


+class ColumnParallelLinearFunction(torch.autograd.Function):
+    """
+    Column-parallel linear layer execution with asynchronous all-reduce
+    execution in backprop.
+    """
+    @staticmethod
+    def forward(ctx, input, weight, bias, use_bias):
+        ctx.save_for_backward(input, weight)
+        ctx.use_bias = use_bias
+        output = torch.matmul(input, weight.t())
+        if use_bias:
+            output = output + bias
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight = ctx.saved_tensors
+        use_bias = ctx.use_bias
+        grad_input = grad_output.matmul(weight)
+        # Asynchronous all-reduce
+        handle = torch.distributed.all_reduce(
+            grad_input, group=get_tensor_model_parallel_group(), async_op=True)
+        # Delay the start of the weight gradient computation briefly (~3us) so the
+        # all-reduce is scheduled first and has GPU resources allocated
+        _ = torch.empty(1, device=grad_output.device) + 1
+        grad_weight = grad_output.t().matmul(input)
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
+        handle.wait()
+        return grad_input, grad_weight, grad_bias, None
+
+
 class ColumnParallelLinear(torch.nn.Module):
     """Linear layer with column parallelism.
...
@@ -272,16 +304,29 @@ class ColumnParallelLinear(torch.nn.Module):
             self.bias.zero_()
         else:
             self.register_parameter('bias', None)

+        self.async_tensor_parallel_allreduce = (
+            args.async_tensor_parallel_allreduce and
+            world_size > 1)
+
     def forward(self, input_):
-        # Set up backprop all-reduce.
-        input_parallel = copy_to_tensor_model_parallel_region(input_)
-        # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
-        output_parallel = F.linear(input_parallel, self.weight, bias)
+
+        if self.async_tensor_parallel_allreduce:
+            input_shape = input_.shape
+            input_ = input_.view(input_shape[0] * input_shape[1], input_shape[2])
+            # Matrix multiply with asynchronous tensor-parallel all-reduce execution
+            output_parallel = ColumnParallelLinearFunction.apply(
+                input_, self.weight, bias, bias is not None)
+            output_parallel = output_parallel.view(
+                input_shape[0], input_shape[1], output_parallel.shape[1])
+        else:
+            # Set up backprop all-reduce.
+            input_parallel = copy_to_tensor_model_parallel_region(input_)
+            # Matrix multiply.
+            output_parallel = F.linear(input_parallel, self.weight, bias)

         if self.gather_output:
             # All-gather across the partitions.
             output = gather_from_tensor_model_parallel_region(output_parallel)
...
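In the new forward path, the [sequence, batch, hidden] activation is flattened to 2-D before ColumnParallelLinearFunction.apply and the output is reshaped back to 3-D afterwards, since the custom function computes a plain 2-D matmul. A small self-contained sketch of just that reshape bracketing (tensor sizes here are made up for illustration):

import torch

s, b, h, out_per_partition = 32, 4, 64, 128   # illustrative sizes only
input_ = torch.randn(s, b, h)
weight = torch.randn(out_per_partition, h)

input_shape = input_.shape
flat = input_.view(input_shape[0] * input_shape[1], input_shape[2])    # [s*b, h]
output_parallel = flat.matmul(weight.t())                              # what forward() computes
output_parallel = output_parallel.view(
    input_shape[0], input_shape[1], output_parallel.shape[1])          # back to [s, b, out]
assert output_parallel.shape == (s, b, out_per_partition)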