Commit 1f48a6b3 authored by Jiezhong Qiu

fix layernorm and remove unnecessary bias

parent fa023f32
@@ -824,16 +824,12 @@ class CustomizedMoEPositionwiseFF(FMoETransformerMLP):
         def activation(x):
             return self.dropout(F.relu(x))
 
         super().__init__(num_expert=moe_num_expert, d_model=d_model, d_hidden=d_inner, top_k=moe_top_k,
-                         pre_lnorm=pre_lnorm, activation=activation)
+                         do_lnorm=True, pre_lnorm=pre_lnorm, activation=activation)
         self.dropout = nn.Dropout(dropout)
-        self.bias = nn.Parameter(
-            torch.zeros(d_model, dtype=torch.float32)
-        )
 
     def forward(self, x):
         x = super().forward(x)
-        return x + self.bias
+        return x
 
 
 class DecoderLayer(nn.Module):
     def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):
...
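For reference, a minimal sketch of how the class reads after this commit. It assumes the forked FMoETransformerMLP accepts the do_lnorm/pre_lnorm keyword arguments used in the hunk above, that the constructor signature (d_model, d_inner, dropout, pre_lnorm, moe_num_expert, moe_top_k) and its defaults match the rest of the file, and that the standard fastmoe import path applies; none of these are confirmed by the diff itself.

import torch.nn as nn
import torch.nn.functional as F
from fmoe.transformer import FMoETransformerMLP  # assumed import path; the fork may differ


class CustomizedMoEPositionwiseFF(FMoETransformerMLP):
    # Sketch of the post-commit class; signature and defaults are assumptions.
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False,
                 moe_num_expert=64, moe_top_k=2):
        # self.dropout is assigned after super().__init__, but activation is
        # only invoked later during forward, so the reference is safe.
        def activation(x):
            return self.dropout(F.relu(x))

        # do_lnorm=True presumably moves LayerNorm inside FMoETransformerMLP,
        # so the extra per-layer bias previously added in forward is removed.
        super().__init__(num_expert=moe_num_expert, d_model=d_model,
                         d_hidden=d_inner, top_k=moe_top_k,
                         do_lnorm=True, pre_lnorm=pre_lnorm,
                         activation=activation)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # No trailing bias term anymore; the parent handles normalization.
        return super().forward(x)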