Can train now: enable constant-1 input padding for the MoE layers and remove debug prints

97a2b1be · Jiezhong Qiu · 3be6ac10 · 97a2b1be
Commit 97a2b1be authored Jan 03, 2021 by Jiezhong Qiu
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 6 deletions

pytorch/mem_transformer.py +4 -6

No files found.
--- a/pytorch/mem_transformer.py
+++ b/pytorch/mem_transformer.py
@@ -47,8 +47,8 @@ class CustomizedMoEPositionwiseFF(nn.Module):
        self.gate = nn.Linear(d_model, num_expert)
-        self.moe1 = MOELayer(num_expert=num_expert, in_feat=d_model, out_feat=d_inner)
+        self.moe1 = MOELayer(num_expert=num_expert, in_feat=d_model+1, out_feat=d_inner)
-        self.moe2 = MOELayer(num_expert=num_expert, in_feat=d_inner, out_feat=d_model)
+        self.moe2 = MOELayer(num_expert=num_expert, in_feat=d_inner+1, out_feat=d_model)
        self.layer_norm = nn.LayerNorm(d_model)
@@ -76,15 +76,13 @@ class CustomizedMoEPositionwiseFF(nn.Module):
        core_out = []
        inp = inp.view(-1, self.d_model)
-        # inp = F.pad(inp, pad=(0, 1), mode='constant', value=1.0)
+        inp = F.pad(inp, pad=(0, 1), mode='constant', value=1.0)
        for i in range(self.top_k):
-            print("top %d" % i)
            gate_idx = gate_top_k_idx[:, i].contiguous()
-            print(inp.size(), gate_idx.size(), inp.device, gate_idx.device)
            x = self.moe1(inp, gate_idx)
            x = self.dropout(F.relu(x))
-            # x = F.pad(x, pad=(0, 1), mode='constant', value=1.0)
+            x = F.pad(x, pad=(0, 1), mode='constant', value=1.0)
            x = self.moe2(x, gate_idx)
            x = self.dropout(x) # (BxL) x d_model
            core_out.append(x.unsqueeze(1)) # (BxL) x 1 x d_model