[feat] support eval in mevo (#884)

- During eval, we fall back to just the output projection without fusing
- Added a unit test to ensure the shape is correct

[feat] support eval in mevo (#884)
- During eval, we fall back to just the output projection without fusing
- Added a unit test to ensure the shape is correct
56add6d5 · Min Xu · GitHub · e6acdcc3 · 56add6d5 · 56add6d5
Unverified Commit 56add6d5 authored Dec 13, 2021 by Min Xu Committed by GitHub Dec 13, 2021
Show whitespace changes
Inline Side-by-side

Showing with 20 additions and 2 deletions

fairscale/experimental/nn/mevo.py fairscale/experimental/nn/mevo.py +9 -2

tests/experimental/nn/test_mevo.py tests/experimental/nn/test_mevo.py +11 -0

No files found.
--- a/fairscale/experimental/nn/mevo.py
+++ b/fairscale/experimental/nn/mevo.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.


-from typing import Any, Tuple
+from typing import Any, Optional, Tuple

 import torch
 from torch import nn
@@ -430,7 +430,14 @@ class MemoryEfficientVocabOutput(nn.Module):  # AKA. MEVO
        # nlprob, then sum over all tokens.
        return -prob.sum()

-    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:  # type: ignore
+    def eval_forward(self, input: torch.Tensor) -> torch.Tensor:
+        """Eval time forward that doesn't fuse the softmax and NLL Loss kernels."""
+        return torch.matmul(input, self.proj_weight.T)
+
+    def forward(self, input: torch.Tensor, target: Optional[torch.Tensor]) -> torch.Tensor:  # type: ignore
+        if not self.training and target is None:
+            return self.eval_forward(input)
+
        if DEBUG and dist.is_initialized() and dist.get_rank() == 0:
            cur_mem = round(torch.cuda.memory_allocated() / 1024 / 1024)
            mem = round(torch.cuda.max_memory_allocated() / 1024 / 1024)

--- a/tests/experimental/nn/test_mevo.py
+++ b/tests/experimental/nn/test_mevo.py
@@ -27,6 +27,17 @@ _dense_out = {}  # type: ignore
 _dense_grad = {}  # type: ignore


+@skip_if_no_cuda
+def test_mevo_eval():
+    """Test eval mode without target tensor"""
+    weight = torch.nn.Linear(3, 4).cuda().weight
+    input = torch.rand(1, 5, 3).cuda()
+    k = MEVO(weight)
+    k.eval()
+    out = k(input, None)
+    assert out.shape == (1, 5, 4)
+
+
 @skip_if_no_cuda
 def test_mevo():
    """Test the MEVO kernel by itself."""