Commit 97a2b1be authored by Jiezhong Qiu
Browse files

can train now

parent 3be6ac10
...@@ -47,8 +47,8 @@ class CustomizedMoEPositionwiseFF(nn.Module): ...@@ -47,8 +47,8 @@ class CustomizedMoEPositionwiseFF(nn.Module):
self.gate = nn.Linear(d_model, num_expert) self.gate = nn.Linear(d_model, num_expert)
self.moe1 = MOELayer(num_expert=num_expert, in_feat=d_model, out_feat=d_inner) self.moe1 = MOELayer(num_expert=num_expert, in_feat=d_model+1, out_feat=d_inner)
self.moe2 = MOELayer(num_expert=num_expert, in_feat=d_inner, out_feat=d_model) self.moe2 = MOELayer(num_expert=num_expert, in_feat=d_inner+1, out_feat=d_model)
self.layer_norm = nn.LayerNorm(d_model) self.layer_norm = nn.LayerNorm(d_model)
...@@ -76,15 +76,13 @@ class CustomizedMoEPositionwiseFF(nn.Module): ...@@ -76,15 +76,13 @@ class CustomizedMoEPositionwiseFF(nn.Module):
core_out = [] core_out = []
inp = inp.view(-1, self.d_model) inp = inp.view(-1, self.d_model)
# inp = F.pad(inp, pad=(0, 1), mode='constant', value=1.0) inp = F.pad(inp, pad=(0, 1), mode='constant', value=1.0)
for i in range(self.top_k): for i in range(self.top_k):
print("top %d" % i)
gate_idx = gate_top_k_idx[:, i].contiguous() gate_idx = gate_top_k_idx[:, i].contiguous()
print(inp.size(), gate_idx.size(), inp.device, gate_idx.device)
x = self.moe1(inp, gate_idx) x = self.moe1(inp, gate_idx)
x = self.dropout(F.relu(x)) x = self.dropout(F.relu(x))
# x = F.pad(x, pad=(0, 1), mode='constant', value=1.0) x = F.pad(x, pad=(0, 1), mode='constant', value=1.0)
x = self.moe2(x, gate_idx) x = self.moe2(x, gate_idx)
x = self.dropout(x) # (BxL) x d_model x = self.dropout(x) # (BxL) x d_model
core_out.append(x.unsqueeze(1)) # (BxL) x 1 x d_model core_out.append(x.unsqueeze(1)) # (BxL) x 1 x d_model
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment