OpenDAS / FastMoE · Commits

Commit 63f6ebbf
Authored Feb 26, 2021 by Jiezhong Qiu

return a zero bias wo grad for megatron

the true bias has been added in FMoeLinear

Parent: fe2009b1
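Context for the change, as a hedged sketch rather than Megatron's exact code: Megatron-LM v2.0 treats the MLP block as returning a pair `(output, bias)` and adds the bias later in its bias-dropout-add step. Because `FMoELinear` already applies the real bias inside the expert computation, the adaptor only needs to hand back a dummy bias that contributes nothing and receives no gradient. The helper name `bias_dropout_add` and the toy tensors below are illustrative, not taken from either codebase.

```python
# Hedged sketch of the consumer side of the (output, bias) pair; illustrative only.
import torch

def bias_dropout_add(x, bias, residual, p, training):
    # Megatron-style step: the bias returned by the MLP is added here.
    out = torch.nn.functional.dropout(x + bias, p=p, training=training)
    return residual + out

hidden = torch.randn(4, 8, requires_grad=True)
residual = hidden
mlp_out = hidden * 2.0                                   # stand-in for super().forward(inp)
zero_bias = mlp_out.new_zeros(mlp_out.size(-1), requires_grad=False)

# Adding the all-zero bias is a no-op, so the true bias applied inside
# FMoELinear is not added twice, and no gradient flows to the dummy tensor.
out = bias_dropout_add(mlp_out, zero_bias, residual, p=0.0, training=False)
assert torch.allclose(out, residual + mlp_out)
assert not zero_bias.requires_grad and zero_bias.grad_fn is None
```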
Showing 1 changed file with 3 additions and 7 deletions.

fmoe/megatron.py (+3 −7)
@@ -3,8 +3,6 @@ The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
 lines of modification.
 See `examples/megatron` for usage instructions.
 '''
-import torch
-
 from .transformer import FMoETransformerMLP
 from .distributed import DistributedGroupedDataParallel
 from .utils import get_torch_default_comm
@@ -28,12 +26,10 @@ class MegatronMLP(FMoETransformerMLP):
             d_model=args.hidden_size,
             d_hidden=args.hidden_hidden_size,
             world_size=world_size,
             mp_group=group,
             expert_dp_comm='none' if args.distributed_experts else 'dp')
-        self.bias = torch.nn.parameter.Parameter(torch.zeros(
-            args.hidden_size, dtype=torch.float32))
-
     def forward(self, inp):
-        return super().forward(inp), self.bias
-
+        output = super().forward(inp)
+        bias = output.new_zeros(output.size(-1), requires_grad=False)
+        return output, bias
 def fmoefy(model, num_experts=None, distributed_experts=True,
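The replacement also explains the dropped `import torch`: `Tensor.new_zeros` creates the dummy bias with the same dtype and device as the MoE output, so the adaptor no longer needs a module-level `torch` reference or a stored `args.hidden_size` parameter. A minimal standalone sketch of that behaviour (the tensors are illustrative, not FastMoE code):

```python
# Minimal sketch of the Tensor.new_zeros behaviour the new forward relies on.
import torch

output = torch.randn(2, 4).half()        # e.g. an fp16 activation from the MoE layer
bias = output.new_zeros(output.size(-1), requires_grad=False)

# The dummy bias inherits dtype (and device) from `output`, so it can be added
# to the activation without explicit casts, and it never accumulates gradients.
assert bias.dtype == output.dtype
assert bias.shape == (output.size(-1),)
assert not bias.requires_grad
```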