Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
4df8b7a2
Commit
4df8b7a2
authored
Sep 02, 2021
by
slym
Browse files
reflect feedback
parent
3f652469
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
5 additions
and
5 deletions
+5
-5
megatron/mpu/layers.py
megatron/mpu/layers.py
+5
-5
No files found.
megatron/mpu/layers.py
View file @
4df8b7a2
...
@@ -205,11 +205,11 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
...
@@ -205,11 +205,11 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
execution in backprop.
execution in backprop.
"""
"""
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
input
,
weight
,
bias
,
use_
bias
):
def
forward
(
ctx
,
input
,
weight
,
bias
):
ctx
.
save_for_backward
(
input
,
weight
)
ctx
.
save_for_backward
(
input
,
weight
)
ctx
.
use_bias
=
use_
bias
ctx
.
use_bias
=
bias
is
not
None
output
=
torch
.
matmul
(
input
,
weight
.
t
())
output
=
torch
.
matmul
(
input
,
weight
.
t
())
if
use_
bias
:
if
bias
is
not
None
:
output
=
output
+
bias
output
=
output
+
bias
return
output
return
output
...
@@ -227,7 +227,7 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
...
@@ -227,7 +227,7 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
grad_weight
=
grad_output
.
t
().
matmul
(
input
)
grad_weight
=
grad_output
.
t
().
matmul
(
input
)
grad_bias
=
grad_output
.
sum
(
dim
=
0
)
if
use_bias
else
None
grad_bias
=
grad_output
.
sum
(
dim
=
0
)
if
use_bias
else
None
handle
.
wait
()
handle
.
wait
()
return
grad_input
,
grad_weight
,
grad_bias
,
None
return
grad_input
,
grad_weight
,
grad_bias
class
ColumnParallelLinear
(
torch
.
nn
.
Module
):
class
ColumnParallelLinear
(
torch
.
nn
.
Module
):
...
@@ -318,7 +318,7 @@ class ColumnParallelLinear(torch.nn.Module):
...
@@ -318,7 +318,7 @@ class ColumnParallelLinear(torch.nn.Module):
input_
=
input_
.
view
(
input_shape
[
0
]
*
input_shape
[
1
],
input_shape
[
2
])
input_
=
input_
.
view
(
input_shape
[
0
]
*
input_shape
[
1
],
input_shape
[
2
])
# Matrix multiply with asynchronous all-reduce execution
# Matrix multiply with asynchronous all-reduce execution
output_parallel
=
ColumnParallelLinearWithAsyncAllreduce
.
apply
(
output_parallel
=
ColumnParallelLinearWithAsyncAllreduce
.
apply
(
input_
,
self
.
weight
,
bias
,
bias
is
not
None
)
input_
,
self
.
weight
,
bias
)
output_parallel
=
output_parallel
.
view
(
output_parallel
=
output_parallel
.
view
(
input_shape
[
0
],
input_shape
[
1
],
output_parallel
.
shape
[
1
])
input_shape
[
0
],
input_shape
[
1
],
output_parallel
.
shape
[
1
])
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment