OpenDAS / FastMoE / Commits

Commit b97483a4, authored Feb 23, 2021 by Rick Ho
Parent: 8f67b530

megatron support for specific expert parallelism
Showing 2 changed files with 4 additions and 3 deletions:

fmoe/layers.py    +2 -2
fmoe/megatron.py  +2 -1
fmoe/layers.py
@@ -47,7 +47,7 @@ class FMoELinear(nn.Module):
         device = self.weight.device
         dtype = self.weight.dtype
         weight = rng.uniform(-bound, bound, size=tuple(self.weight.size()))
-        self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+        self.weight.data = torch.Tensor(weight, dtype=dtype, device=device)
         if self.bias is not None:
             fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[0])
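For context, the changed line sits in FMoELinear's reset_parameters: weights are sampled uniformly with a NumPy generator and copied into the parameter. Below is a minimal standalone sketch of that pattern; the tensor shape, the bound, and the seeded rng are assumptions for illustration, since the hunk only shows the sampling and the copy (the real bound and rng are computed above it).

import math

import numpy as np
import torch
from torch import nn

# Stand-in for FMoELinear's expert-stacked weight: one (out_feat, in_feat)
# matrix per expert.
num_expert, in_feat, out_feat = 4, 16, 32
weight = nn.Parameter(torch.empty(num_expert, out_feat, in_feat))

# Assumed kaiming-style bound; the actual computation lives above this hunk.
bound = 1.0 / math.sqrt(in_feat)
rng = np.random.default_rng(0)

sample = rng.uniform(-bound, bound, size=tuple(weight.size()))
# Note: torch.tensor() (lowercase) accepts dtype= and device= keywords,
# while the legacy torch.Tensor constructor rejects dtype=, so the two
# sides of the -/+ pair above are not interchangeable.
weight.data = torch.tensor(sample, dtype=weight.dtype, device=weight.device)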
@@ -143,7 +143,7 @@ class FMoE(nn.Module):
         self.top_k = top_k
         self.gate = gate(d_model, num_expert, world_size, top_k)
         if expert is not None:
             self.experts = nn.ModuleList([expert(d_model)
                 for _ in range(num_expert)])
             self.experts_fused = False
         else:
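The second hunk is in FMoE's constructor, where per-expert modules are instantiated from a caller-supplied expert factory. Here is a runnable sketch of that construction; the toy Expert class is an assumption standing in for whatever callable the caller passes, and only the ModuleList comprehension and the experts_fused flag come from the hunk itself.

from torch import nn

class Expert(nn.Module):
    # Toy expert: the real factory is supplied by the caller of FMoE.
    def __init__(self, d_model):
        super().__init__()
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        return self.fc(x)

d_model, num_expert = 16, 4
expert = Expert  # a callable taking d_model, as the hunk assumes

if expert is not None:
    # One independent module per local expert, as in the hunk above.
    experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])
    experts_fused = False
else:
    # The else: branch is cut off in this view; judging by the flag name,
    # it switches to a fused expert implementation.
    experts_fused = True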
fmoe/megatron.py
@@ -26,7 +26,8 @@ class MegatronMLP(FMoETransformerMLP):
         super().__init__(args.num_experts,
                 top_k=args.top_k,
                 d_model=args.hidden_size, d_hidden=args.hidden_hidden_size,
-                world_size=world_size, mp_group=group)
+                world_size=world_size, mp_group=group,
+                expert_dp_comm='none' if args.distributed_experts else 'dp')
         self.bias = torch.nn.parameter.Parameter(
             torch.zeros(args.hidden_size, dtype=torch.float32)
         )
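The megatron.py change threads a new expert_dp_comm argument through to the FMoE base class. The idea behind the conditional: when args.distributed_experts is set, each rank owns different expert weights, so the data-parallel wrapper must skip gradient all-reduce for them ('none'); otherwise the experts are replicated and synchronized like any other parameter ('dp'). A minimal sketch of how such a tag could be consumed follows; the mark_module_parallel_comm helper and the dp_comm attribute are assumptions for illustration, not shown in this diff.

from torch import nn

def mark_module_parallel_comm(module, comm):
    # Hypothetical helper: tag each parameter with the gradient-communication
    # group a distributed data-parallel wrapper should use for it.
    for p in module.parameters():
        p.dp_comm = comm

class Args:
    # Stand-in for Megatron-LM's parsed command-line arguments.
    distributed_experts = True

args = Args()
experts = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])

# The keyword added by this commit, applied to the expert stack: sharded
# experts opt out of data-parallel all-reduce, replicated experts keep the
# ordinary 'dp' behaviour.
expert_dp_comm = 'none' if args.distributed_experts else 'dp'
mark_module_parallel_comm(experts, expert_dp_comm)

assert all(p.dp_comm == 'none' for p in experts.parameters())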