OpenDAS / FastMoE

Commit d6e7a429
authored Feb 05, 2021 by Rick Ho

    sync in the whole world instead of mp world in megatron

parent f2040d9f

Showing 2 changed files with 12 additions and 3 deletions:

    fmoe/layers.py      +6  -0
    fmoe/megatron.py    +6  -3
fmoe/layers.py
...
...
@@ -148,6 +148,12 @@ class FMoETransformerMLP(nn.Module):
        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden)
        self.h4toh = FMoELinear(num_expert, d_hidden, d_model)
        if self.world_size > self.mp_size:
            for p in self.htoh4.parameters():
                setattr(p, 'dp_comm', 'none')
            for p in self.h4toh.parameters():
                setattr(p, 'dp_comm', 'none')
        self.gate = FMoENaiveGate(d_model, num_expert, world_size, top_k)
        for p in self.gate.parameters():
            setattr(p, 'dp_comm', 'world')
...
...
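For context on the `dp_comm` tags set above: FastMoE's DistributedGroupedDataParallel wrapper (fmoe/distributed.py, not part of this diff) reads this attribute to decide which process group, if any, a parameter's gradient is reduced over. The snippet below is only a minimal sketch of that dispatch idea, not the wrapper's actual implementation; the `comm_groups` mapping and the fallback tag are assumptions made for illustration.

import torch.distributed as dist


def allreduce_grads_by_tag(model, comm_groups):
    # comm_groups is an assumed mapping from tag to process group, e.g.
    # {'world': dist.group.WORLD, 'dp': data_parallel_group}.
    for p in model.parameters():
        if p.grad is None:
            continue
        tag = getattr(p, 'dp_comm', 'dp')  # fallback tag is an assumption
        if tag == 'none':
            # Expert weights when world_size > mp_size: each rank keeps its
            # own experts, so their gradients are never synchronized.
            continue
        group = comm_groups[tag]
        dist.all_reduce(p.grad, group=group)
        p.grad /= dist.get_world_size(group=group)

With the tags from this commit, gate parameters ('world') are averaged across every rank in the job, while expert parameters ('none') stay local to their rank.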
fmoe/megatron.py
-'''
+r'''
The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
lines of modification.
See `examples/megatron` for usage instructions.
'''
from .layers import FMoETransformerMLP
from .distributed import DistributedGroupedDataParallel
from .utils import get_torch_default_comm


def _create_moe_mlp(args, group):
...
...
@@ -42,9 +43,11 @@ def fmoefy(model, num_experts=None, distributed_experts=True):
    they are trained in data-parallel mode. This can be useful when testing on
    small models that do not require high training throughput or large parameter
    capacity.
    Note that pipeline parallel is not supported yet. When distributed experts
    are enabled, their communicator should be Megatron's
    tensor_model_parall_comm x data_parallel_comm, which is not created.
    '''
    from megatron import get_args
    from megatron import mpu
    args = get_args()
    if num_experts is not None:
        args.num_experts = num_experts
...
...
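The docstring above advertises enabling FastMoE "with at most two lines of modification". As a rough illustration only (the exact call site in a Megatron-LM v2.0 pretraining script is described in `examples/megatron`, not in this commit), the first of those lines is a call to fmoefy on the freshly built model; the expert count here is an example value:

from fmoe.megatron import fmoefy

# after Megatron-LM has constructed the GPT/BERT model for this rank
model = fmoefy(model, num_experts=4)  # replaces each transformer MLP with an FMoE MLP

The other modification typically swaps Megatron's data-parallel wrapper for the DistributedGroupedDataParallel imported at the top of this file, so the per-parameter `dp_comm` tags are honored; that wrapper call is not shown in this diff.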
@@ -57,7 +60,7 @@ def fmoefy(model, num_experts=None, distributed_experts=True):
    args.distributed_experts = distributed_experts
    for l in model.language_model.transformer.layers:
-        l.mlp = _create_moe_mlp(args, mpu.get_model_parallel_group())
+        l.mlp = _create_moe_mlp(args, get_torch_default_comm())
    return model
...
...
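The change above is the core of this commit: the communicator handed to _create_moe_mlp is now the default torch.distributed group (the "whole world") rather than Megatron's model-parallel group, so the gate parameters tagged 'dp_comm' = 'world' in fmoe/layers.py are synchronized across all ranks, data-parallel replicas included. The snippet below is a self-contained illustration of why the two groups differ; the 2-way model-parallel split and the group construction are assumptions for the example, not code from FastMoE or Megatron.

import torch
import torch.distributed as dist


def compare_groups(mp_size=2):
    # Assumes torch.distributed is already initialized (e.g. via torchrun)
    # and that world_size is a multiple of mp_size.
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Build Megatron-style model-parallel groups: every rank must take part
    # in creating every group, even the ones it does not belong to.
    mp_group = None
    for start in range(0, world_size, mp_size):
        ranks = list(range(start, start + mp_size))
        group = dist.new_group(ranks)
        if rank in ranks:
            mp_group = group

    t_mp = torch.tensor([float(rank)])
    t_world = torch.tensor([float(rank)])
    dist.all_reduce(t_mp, group=mp_group)             # sums over mp_size ranks only
    dist.all_reduce(t_world, group=dist.group.WORLD)  # sums over every rank in the job

Reducing over the default world group is presumably what a helper named get_torch_default_comm selects, which matches the commit message: sync in the whole world instead of the model-parallel world.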