Commit 63f6ebbf authored by Jiezhong Qiu

Return a zero bias without grad for Megatron

The true bias has already been added in FMoELinear.
parent fe2009b1
@@ -3,8 +3,6 @@
 The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
 lines of modification.
 See `examples/megatron` for usage instructions.
 '''
-import torch
 from .transformer import FMoETransformerMLP
 from .distributed import DistributedGroupedDataParallel
 from .utils import get_torch_default_comm
@@ -28,12 +26,10 @@ class MegatronMLP(FMoETransformerMLP):
             d_model=args.hidden_size, d_hidden=args.hidden_hidden_size,
             world_size=world_size, mp_group=group,
             expert_dp_comm='none' if args.distributed_experts else 'dp')
-        self.bias = torch.nn.parameter.Parameter(
-            torch.zeros(args.hidden_size, dtype=torch.float32)
-        )

     def forward(self, inp):
-        return super().forward(inp), self.bias
+        output = super().forward(inp)
+        bias = output.new_zeros(output.size(-1), requires_grad=False)
+        return output, bias

 def fmoefy(model, num_experts=None, distributed_experts=True,
...
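
For context, a minimal, self-contained sketch of the new return convention (the function name, tensor shapes, and the standalone wrapper are illustrative, not FastMoE source). Megatron-LM expects its MLP to return an (output, bias) pair and adds the bias to the output downstream; since FMoELinear already applies the true bias inside the experts, the adaptor hands back an all-zero, non-trainable bias so that the external addition is a no-op.

import torch

def moe_forward_with_dummy_bias(output: torch.Tensor):
    # Illustrative stand-in for MegatronMLP.forward's return value:
    # new_zeros keeps dtype/device consistent with the MoE output, and
    # requires_grad=False prevents a gradient from accumulating for a
    # bias that is never actually trained here.
    bias = output.new_zeros(output.size(-1), requires_grad=False)
    return output, bias

if __name__ == "__main__":
    out = torch.randn(4, 16, 1024)            # e.g. [seq, batch, hidden]
    out, bias = moe_forward_with_dummy_bias(out)
    print(bias.shape, bias.requires_grad)     # torch.Size([1024]) False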