add TODO for bias issue

b44de4cd · Jiezhong Qiu · 08e08319 · b44de4cd
Commit b44de4cd authored Feb 25, 2021 by Jiezhong Qiu
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 5 deletions

fmoe/layers.py fmoe/layers.py +10 -5

No files found.
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -61,16 +61,21 @@ class FMoELinear(nn.Module):
        '''
        x = MOELinear.apply(inp, self.weight, fwd_expert_count)
        if self.bias is not None:
-            # TODO: torch.repeat_interleave seems have wrong behavior 
+            # TODO: torch.repeat_interleave seems to have wrong
-            # in backward, leading to incorrect gradient for bias.
+            # behaviors in backward, leading to incorrect
-            # Thus we use a for-loop to manually expand the bias term.
+            # gradient computation for bias.
+            # Thus we use a for-loop to manually expand the bias.
            # This part should eventually go into MOELinear.apply.
-            #bias = torch.repeat_interleave(self.bias,
+            # bias = torch.repeat_interleave(self.bias,
            #        fwd_expert_count.to(self.bias.device), dim=0)
            bias = []
            for i in range(self.num_expert):
                if fwd_expert_count[i] > 0:
-                    bias.append(self.bias[i].unsqueeze(0).expand(fwd_expert_count[i], -1))
+                    bias.append(
+                        self.bias[i].unsqueeze(0).expand(
+                            fwd_expert_count[i], -1
+                        )
+                    )
            bias = torch.cat(bias, dim=0)
            x = x + bias
        return x