"vscode:/vscode.git/clone" did not exist on "2a8339dbc853ef85616c43a716e921526531679e"
Unverified Commit b56c8043 authored by Rick Ho, committed by GitHub

Merge pull request #7 from laekov/reproducibility

Reproducibility
parents 03b2a725 b44de4cd
@@ -380,8 +380,8 @@ from fmoe import FMoETransformerMLP
 class CustomizedMoEPositionwiseFF(FMoETransformerMLP):
     def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, moe_num_expert=64, moe_top_k=2):
         activation = nn.Sequential(
-            nn.Dropout(dropout),
-            nn.ReLU()
+            nn.ReLU(),
+            nn.Dropout(dropout)
         )
         super().__init__(num_expert=moe_num_expert, d_model=d_model, d_hidden=d_inner, top_k=moe_top_k,
                 do_lnorm=True, pre_lnorm=pre_lnorm, activation=activation, dropout=dropout)
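For context (not part of the commit), the hunk above swaps the two modules in the expert activation so that dropout is applied after the nonlinearity, presumably to reproduce the ReLU-then-Dropout ordering of the original Transformer-XL positionwise feed-forward block. A minimal sketch contrasting the two orderings, with hypothetical shapes chosen for illustration:

import torch
import torch.nn as nn

torch.manual_seed(0)
x = torch.randn(4, 8)  # hypothetical (tokens, d_model) batch

old_act = nn.Sequential(nn.Dropout(0.1), nn.ReLU())  # before this commit
new_act = nn.Sequential(nn.ReLU(), nn.Dropout(0.1))  # after this commit

# Both map (4, 8) -> (4, 8); only the point at which units are
# zeroed by dropout differs.
print(old_act(x).shape, new_act(x).shape)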
@@ -61,8 +61,22 @@ class FMoELinear(nn.Module):
         '''
         x = MOELinear.apply(inp, self.weight, fwd_expert_count)
         if self.bias is not None:
-            bias = torch.repeat_interleave(self.bias,
-                    fwd_expert_count.to(self.bias.device), dim=0)
+            # TODO: torch.repeat_interleave seems to have wrong
+            # behavior in backward, leading to incorrect
+            # gradient computation for the bias.
+            # Thus, we use a for-loop to manually expand the bias.
+            # This part should eventually go into MOELinear.apply.
+            # bias = torch.repeat_interleave(self.bias,
+            #         fwd_expert_count.to(self.bias.device), dim=0)
+            bias = []
+            for i in range(self.num_expert):
+                if fwd_expert_count[i] > 0:
+                    bias.append(
+                        self.bias[i].unsqueeze(0).expand(
+                            fwd_expert_count[i], -1
+                        )
+                    )
+            bias = torch.cat(bias, dim=0)
             x = x + bias
         return x
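As a quick sanity check (again, not part of the commit), the loop-based expansion produces the same forward values as torch.repeat_interleave; the commit replaces the one-liner only because of the suspected backward/gradient issue noted in the TODO. A self-contained sketch with hypothetical sizes:

import torch

num_expert, d_model = 4, 8  # hypothetical sizes
bias = torch.randn(num_expert, d_model, requires_grad=True)
fwd_expert_count = torch.tensor([3, 0, 2, 1])  # tokens routed to each expert

# Loop-based expansion, mirroring the committed workaround:
# repeat expert i's bias row fwd_expert_count[i] times.
chunks = []
for i in range(num_expert):
    if fwd_expert_count[i] > 0:
        chunks.append(bias[i].unsqueeze(0).expand(fwd_expert_count[i], -1))
expanded = torch.cat(chunks, dim=0)

# Forward values match the one-liner the commit comments out.
reference = torch.repeat_interleave(bias, fwd_expert_count, dim=0)
assert torch.equal(expanded, reference)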