Unverified Commit 5083a736 authored by Rick Ho, committed by GitHub

Merge pull request #108 from laekov/faster-bug

Fix a type mismatch, a shape mismatch, and a missing guard condition in FasterMoE's expert shadowing
parents 665b99bf 4682c1d0
```diff
@@ -29,6 +29,8 @@ def stash_expert_params(e, params):
 def pop_expert_params(e):
     if not hasattr(e, 'expert_param_stash'):
         return
+    if not e.expert_param_stash:
+        return
     for n, p in e.named_parameters():
         with torch.no_grad():
             p.copy_(e.expert_param_stash[n])
```
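The new guard skips restoring parameters when the stash dictionary exists but is empty, so calling `pop_expert_params` on an expert that was never shadowed (or was already restored) is a no-op instead of failing on a missing key. A minimal sketch of that situation; the `nn.Linear` stand-in expert is purely illustrative:

```python
import torch
import torch.nn as nn

def pop_expert_params(e):
    # Same logic as the patched function above.
    if not hasattr(e, 'expert_param_stash'):
        return
    if not e.expert_param_stash:
        # New guard: nothing was stashed, so there is nothing to restore.
        return
    for n, p in e.named_parameters():
        with torch.no_grad():
            p.copy_(e.expert_param_stash[n])

expert = nn.Linear(4, 4)          # hypothetical stand-in for a FastMoE expert
expert.expert_param_stash = {}    # stash attribute exists but holds nothing
pop_expert_params(expert)         # no-op; without the guard, the loop would hit a KeyError
```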
```diff
@@ -53,6 +55,6 @@ def set_grads(e, grads):
         seg = grads[offset:offset + p.numel()]
         offset += p.numel()
         if p.grad is None:
-            p.grad = seg.clone()
+            p.grad = seg.clone().reshape(p.shape)
         else:
             p.grad += seg.reshape(p.shape)
```
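The slicing by `p.numel()` suggests `grads` is a flat 1-D buffer holding all of the expert's gradients back to back, so each slice `seg` is 1-D. PyTorch expects an assigned `.grad` to match the parameter's shape, so the first assignment must reshape the cloned slice just as the accumulation branch already did. A small sketch under that assumption, using a hypothetical `nn.Linear` expert:

```python
import torch
import torch.nn as nn

e = nn.Linear(3, 2)   # hypothetical expert module
# A flat buffer with one gradient value per parameter element, back to back.
grads = torch.randn(sum(p.numel() for p in e.parameters()))

offset = 0
for n, p in e.named_parameters():
    seg = grads[offset:offset + p.numel()]        # 1-D slice of the flat buffer
    offset += p.numel()
    if p.grad is None:
        p.grad = seg.clone().reshape(p.shape)     # fixed: shape now matches the parameter
    else:
        p.grad += seg.reshape(p.shape)

# Every parameter now has a gradient of the right shape for the optimizer step.
assert all(p.grad.shape == p.shape for p in e.parameters())
```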
```diff
@@ -35,7 +35,7 @@ class MoEForward(Function):
         x.requires_grad = True
         # To skip torch autograd's version check.
         with torch.autograd.graph.saved_tensors_hooks(nothing, nothing):
-            y0 = expert_fn(x, [x.shape[0]])
+            y0 = expert_fn(x, torch.tensor([x.shape[0]], dtype=torch.int64))
         ctx.gibs[idx] = x
         ctx.gobs[idx] = y0
         y.copy_(y0)
```
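On the regular (non-shadowed) path, `expert_fn` receives the per-expert token counts as an integer tensor, and downstream dispatch code uses tensor methods on that argument, so passing a plain Python list from the shadowing path was the type mismatch the commit title refers to. A hedged sketch with a stand-in `expert_fn`; only the call signature mirrors the hunk, the body is illustrative:

```python
import torch
import torch.nn as nn

expert = nn.Linear(8, 8)   # hypothetical single expert

def expert_fn(inp, fwd_expert_count):
    # Stand-in for the expert dispatch: the count argument is used as a
    # tensor, so a plain Python list would fail here.
    counts = fwd_expert_count.cpu().numpy().tolist()
    chunks = torch.split(inp, counts, dim=0)
    return torch.cat([expert(c) for c in chunks], dim=0)

x = torch.randn(5, 8)
y0 = expert_fn(x, torch.tensor([x.shape[0]], dtype=torch.int64))   # fixed call
# expert_fn(x, [x.shape[0]]) would raise AttributeError: a list has no .cpu()
```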