Commit c65039da authored by Jiezhong Qiu's avatar Jiezhong Qiu
Browse files

Propose and discuss 3 solutions to expand the bias

parent 63f6ebbf
...@@ -61,22 +61,33 @@ class FMoELinear(nn.Module): ...@@ -61,22 +61,33 @@ class FMoELinear(nn.Module):
''' '''
x = MOELinear.apply(inp, self.weight, fwd_expert_count) x = MOELinear.apply(inp, self.weight, fwd_expert_count)
if self.bias is not None: if self.bias is not None:
# TODO: torch.repeat_interleave seems to have wrong # TODO: torch.repeat_interleave seems to have numerical
# behaviors in backward, leading to incorrect # instability in backward, leading to incorrect
# gradient computation for bias. # gradient computation for solution 1 and 2.
# Thus we use a for-loop to manually expand the bias. # Solution 3 uses a for-loop to expand the bias,
# This part should finally go to MOELinear.apply. # but is 50% slower.
# This part should finally go to MOELinear.apply,
# like MOELinear.apply(x, weight, bias, count)
# Solution 1
# bias = torch.repeat_interleave(self.bias, # bias = torch.repeat_interleave(self.bias,
# fwd_expert_count.to(self.bias.device), dim=0) # fwd_expert_count.to(self.bias.device), dim=0)
bias = []
for i in range(self.num_expert): # Solution 2
if fwd_expert_count[i] > 0: bias_idx = torch.arange(self.num_expert)\
bias.append( .repeat_interleave(fwd_expert_count)
self.bias[i].unsqueeze(0).expand( bias = self.bias[bias_idx]
fwd_expert_count[i], -1
) # Solution 3
) # bias = []
bias = torch.cat(bias, dim=0) # for i in range(self.num_expert):
# if fwd_expert_count[i] > 0:
# bias.append(
# self.bias[i].unsqueeze(0).expand(
# fwd_expert_count[i], -1
# )
# )
# bias = torch.cat(bias, dim=0)
x = x + bias x = x + bias
return x return x
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment