"vscode:/vscode.git/clone" did not exist on "2a8339dbc853ef85616c43a716e921526531679e"
Unverified Commit b56c8043 authored by Rick Ho, committed by GitHub

Merge pull request #7 from laekov/reproducibility

Reproducibility
parents 03b2a725 b44de4cd
@@ -380,8 +380,8 @@ from fmoe import FMoETransformerMLP
 class CustomizedMoEPositionwiseFF(FMoETransformerMLP):
     def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, moe_num_expert=64, moe_top_k=2):
         activation = nn.Sequential(
-            nn.Dropout(dropout),
-            nn.ReLU()
+            nn.ReLU(),
+            nn.Dropout(dropout)
         )
         super().__init__(num_expert=moe_num_expert, d_model=d_model, d_hidden=d_inner, top_k=moe_top_k,
                 do_lnorm=True, pre_lnorm=pre_lnorm, activation=activation, dropout=dropout)
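For context (not part of the commit), the hunk above swaps the two modules in the expert activation so that dropout is applied after the nonlinearity, presumably to reproduce the ReLU-then-Dropout ordering of the original Transformer-XL positionwise feed-forward block. A minimal sketch contrasting the two orderings, with hypothetical shapes chosen for illustration:

import torch
import torch.nn as nn

torch.manual_seed(0)
x = torch.randn(4, 8)  # hypothetical (tokens, d_model) batch

old_act = nn.Sequential(nn.Dropout(0.1), nn.ReLU())  # before this commit
new_act = nn.Sequential(nn.ReLU(), nn.Dropout(0.1))  # after this commit

# Both map (4, 8) -> (4, 8); only the point at which units are
# zeroed by dropout differs.
print(old_act(x).shape, new_act(x).shape)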
@@ -61,8 +61,22 @@ class FMoELinear(nn.Module):
         '''
         x = MOELinear.apply(inp, self.weight, fwd_expert_count)
         if self.bias is not None:
-            bias = torch.repeat_interleave(self.bias,
-                    fwd_expert_count.to(self.bias.device), dim=0)
+            # TODO: torch.repeat_interleave seems to have wrong
+            # behavior in backward, leading to incorrect
+            # gradient computation for the bias.
+            # Thus, we use a for-loop to manually expand the bias.
+            # This part should eventually go into MOELinear.apply.
+            # bias = torch.repeat_interleave(self.bias,
+            #         fwd_expert_count.to(self.bias.device), dim=0)
+            bias = []
+            for i in range(self.num_expert):
+                if fwd_expert_count[i] > 0:
+                    bias.append(
+                        self.bias[i].unsqueeze(0).expand(
+                            fwd_expert_count[i], -1
+                        )
+                    )
+            bias = torch.cat(bias, dim=0)
             x = x + bias
         return x
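As a quick sanity check (again, not part of the commit), the loop-based expansion produces the same forward values as torch.repeat_interleave; the commit replaces the one-liner only because of the suspected backward/gradient issue noted in the TODO. A self-contained sketch with hypothetical sizes:

import torch

num_expert, d_model = 4, 8  # hypothetical sizes
bias = torch.randn(num_expert, d_model, requires_grad=True)
fwd_expert_count = torch.tensor([3, 0, 2, 1])  # tokens routed to each expert

# Loop-based expansion, mirroring the committed workaround:
# repeat expert i's bias row fwd_expert_count[i] times.
chunks = []
for i in range(num_expert):
    if fwd_expert_count[i] > 0:
        chunks.append(bias[i].unsqueeze(0).expand(fwd_expert_count[i], -1))
expanded = torch.cat(chunks, dim=0)

# Forward values match the one-liner the commit comments out.
reference = torch.repeat_interleave(bias, fwd_expert_count, dim=0)
assert torch.equal(expanded, reference)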