Commit d83234b0 authored by Rick Ho

use parallel label in gate

parent 67c667f2
@@ -29,20 +29,20 @@ class DistributedGroupedDataParallel(nn.Module):
         for p in self.module.parameters():
             if not p.requires_grad or p.grad is None:
                 continue
-            if hasattr(p, 'parallel_method'):
-                pm = p.parallel_method
+            if hasattr(p, 'dp_comm'):
+                dp_comm = p.dp_comm
             else:
-                pm = 'dp'
-            group_key = (pm, p.dtype)
+                dp_comm = 'dp'
+            group_key = (dp_comm, p.dtype)
             if group_key not in groups:
                 groups[group_key] = [p]
             else:
                 groups[group_key].append(p)
-        for pm, dtype in groups:
-            if pm not in self.comms:
+        for dp_comm, dtype in groups:
+            if dp_comm not in self.comms:
                 continue
-            group = groups[pm, dtype]
-            comm = self.comms[pm]
+            group = groups[dp_comm, dtype]
+            comm = self.comms[dp_comm]
             grads = [p.grad.data for p in group]
             coalesced = _flatten_dense_tensors(grads)
             if fp32_allreduce and dtype != torch.float32:
......
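For readers skimming the hunk above, here is a minimal standalone sketch of the grouped all-reduce pattern it renames: gradients are bucketed by (dp_comm label, dtype), flattened, and reduced over the process group that the label selects. The helper name allreduce_gradients, the comms mapping, and the explicit averaging step are assumptions for illustration, not the repository's exact code.

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_gradients(module, comms, fp32_allreduce=False):
    # Hypothetical helper: comms maps labels such as 'dp' and 'world'
    # to torch.distributed process groups.
    groups = {}
    for p in module.parameters():
        if not p.requires_grad or p.grad is None:
            continue
        # The dp_comm label attached to a parameter picks its reduction group;
        # unlabeled parameters fall back to the plain data-parallel group.
        dp_comm = getattr(p, 'dp_comm', 'dp')
        groups.setdefault((dp_comm, p.dtype), []).append(p)
    for (dp_comm, dtype), params in groups.items():
        if dp_comm not in comms:
            continue
        comm = comms[dp_comm]
        grads = [p.grad.data for p in params]
        coalesced = _flatten_dense_tensors(grads)
        if fp32_allreduce and dtype != torch.float32:
            coalesced = coalesced.float()
        dist.all_reduce(coalesced, group=comm)
        coalesced /= dist.get_world_size(group=comm)
        for grad, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            grad.copy_(synced)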
@@ -92,6 +92,8 @@ class FMoETransformerMLP(nn.Module):
         self.h4toh = FMoELinear(num_expert, d_hidden, d_model)
         self.gate = FMoENaiveGate(d_model, num_expert, world_size, top_k)
+        for p in self.gate.parameters():
+            setattr(p, 'dp_comm', 'world')
         self.layer_norm = nn.LayerNorm(d_model)
         self.bias = torch.nn.parameter.Parameter(
......
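A hedged usage sketch of what the two added lines buy: because the gate's parameters now carry dp_comm='world', a wrapper such as DistributedGroupedDataParallel above all-reduces their gradients over the full world group, while untagged expert parameters keep the default 'dp' behavior. The constructor argument names below are inferred from the fields visible in this hunk and may differ from the actual signature.

# Illustrative only; assumes FMoETransformerMLP and DistributedGroupedDataParallel
# from the files above are in scope, and that the argument names match the hunk.
mlp = FMoETransformerMLP(num_expert=4, d_model=512, d_hidden=2048,
                         world_size=1, top_k=2)
for p in mlp.gate.parameters():
    assert getattr(p, 'dp_comm', None) == 'world'   # gate grads sync world-wide
model = DistributedGroupedDataParallel(mlp)          # expert grads default to 'dp'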
@@ -18,6 +18,8 @@ def create_moe_mlp(args, model_parallel_rank, group):
         model_parallel_rank=model_parallel_rank,
         mp_group=group,
     )
+    for p in fmoe.gate.parameters():
+        setattr(p, 'shared', True)
     return fmoe
......
@@ -29,7 +29,7 @@ if __name__ == '__main__':
             }
         )
     ],
-    version='0.0.1',
+    version='0.0.2',
     cmdclass={
         'build_ext': BuildExtension
     })