propose and discuss 3 solutions to expand bias

c65039da · Jiezhong Qiu · 63f6ebbf · c65039da
Commit c65039da authored Feb 26, 2021 by Jiezhong Qiu
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 15 deletions

fmoe/layers.py fmoe/layers.py +26 -15

No files found.
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -61,28 +61,39 @@ class FMoELinear(nn.Module):
        '''
        x = MOELinear.apply(inp, self.weight, fwd_expert_count)
        if self.bias is not None:
-            # TODO: torch.repeat_interleave seems have wrong
-            # behaviors in backward, leading to incorrect
-            # gradient computation for bias.
-            # Thus we use a for-loop to manually expand the bias.
-            # This part should finally goes to MOELinear.apply.
+            # TODO: torch.repeat_interleave seems have numerical
+            # instability in backward, leading to incorrect
+            # gradient computation for solution 1 and 2.
+            # Solution 3 uses a for-loop to expand the bias,
+            # but is 50% slower.
+            # This part should finally goes to MOELinear.apply,
+            # like MOELinear.apply(x, weight, bias, count)
+
+            # Solution 1
            # bias = torch.repeat_interleave(self.bias,
            #        fwd_expert_count.to(self.bias.device), dim=0)
-            bias = []
-            for i in range(self.num_expert):
-                if fwd_expert_count[i] > 0:
-                    bias.append(
-                        self.bias[i].unsqueeze(0).expand(
-                            fwd_expert_count[i], -1
-                        )
-                    )
-            bias = torch.cat(bias, dim=0)
+
+            # Solution 2
+            bias_idx = torch.arange(self.num_expert)\
+                .repeat_interleave(fwd_expert_count)
+            bias = self.bias[bias_idx]
+
+            # Solution 3
+            # bias = []
+            # for i in range(self.num_expert):
+            #    if fwd_expert_count[i] > 0:
+            #        bias.append(
+            #            self.bias[i].unsqueeze(0).expand(
+            #                fwd_expert_count[i], -1
+            #            )
+            #        )
+            # bias = torch.cat(bias, dim=0)
            x = x + bias
        return x

    def extra_repr(self) -> str:
        return 'num_expert={}, in_features={}, \
-                out_features={}, bias={}, rank={}'.format(
+        out_features={}, bias={}, rank={}'.format(
                    self.num_expert, self.in_feat,
                    self.out_feat, self.bias is not None, self.rank
        )