[feat] moe: annotate expert params (#140)

The expert annotation is used by clip_grads and DDP.

[feat] moe: annotate expert params (#140)
The expert annotation is used by clip_grads and DDP.
ee88bb19 · msbaines · GitHub · d99c445a · ee88bb19 · ee88bb19
Unverified commit ee88bb19, authored Oct 16, 2020 by msbaines; committed by GitHub Oct 16, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 0 deletions

fairscale/nn/moe/moelayer.py (+2 -0)

tests/nn/moe/test_moelayer.py (+11 -0)

No files found.
--- a/fairscale/nn/moe/moelayer.py
+++ b/fairscale/nn/moe/moelayer.py
@@ -60,6 +60,8 @@ class MOELayer(Base):
        self.gate = gate
        self.expert = expert
        self.group = group if group is not None else dist.group.WORLD
+        for p in expert.parameters():
+            p.expert = True  # type: ignore

    def all_to_all_dispatch(self, dispatch_mask: Tensor, input: Tensor) -> Tensor:
        dispatched_input = torch.einsum("gsec,gsm->egcm", dispatch_mask.float(), input)

--- a/tests/nn/moe/test_moelayer.py
+++ b/tests/nn/moe/test_moelayer.py
@@ -45,6 +45,17 @@ def test_create(device):
    moe = MOELayer(gate, expert).to(device)


+@pytest.mark.parametrize("device", devices)
+def test_expert_params(device):  # checks that wrapping an expert in MOELayer flags its parameters
+    model_dim = 8
+    num_experts = 4
+    gate = Top2Gate(model_dim, num_experts)
+    expert = torch.nn.Linear(model_dim, model_dim)
+    moe = MOELayer(gate, expert).to(device)  # built only for its effect on `expert`; `moe` is not used below
+    for p in expert.parameters():
+        assert p.expert is True  # identity check: the flag must be the bool True, not merely truthy
+
+
 @pytest.mark.mpi
 @pytest.mark.parametrize("device", ["cpu"])
 def test_forward(device):