[Kernel] Optimize moe intermediate_cache usage (#13625)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Kernel] Optimize moe intermediate_cache usage (#13625)
Signed-off-by: mgoin <mgoin64@gmail.com>
19d98e0c · Michael Goin · GitHub · 2b04c209 · 19d98e0c
Unverified commit 19d98e0c, authored Mar 03, 2025 by Michael Goin; committed by GitHub on Mar 03, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 11 additions and 6 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py +11 -6

No files found.
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1240,15 +1240,20 @@ def fused_experts_impl(hidden_states: torch.Tensor,
    config = get_config_func(M)
-    intermediate_cache1 = torch.empty((M, top_k_num, N),
+    # We can reuse the memory between these because by the time we need
-                                      device=hidden_states.device,
+    # cache3, we're done with cache1
-                                      dtype=hidden_states.dtype)
+    cache13 = torch.empty(M * top_k_num * max(N, w2.shape[1]),
+                          device=hidden_states.device,
+                          dtype=hidden_states.dtype)
+    intermediate_cache1 = cache13[:M * top_k_num * N].view(
+        (M, topk_ids.shape[1], N))
+    intermediate_cache3 = cache13[:M * top_k_num * w2.shape[1]].view(
+        (M, topk_ids.shape[1], w2.shape[1]))
+    # This needs separate memory since it's used concurrently with cache1
    intermediate_cache2 = torch.empty((M * top_k_num, N // 2),
                                      device=hidden_states.device,
                                      dtype=hidden_states.dtype)
-    intermediate_cache3 = torch.empty((M, top_k_num, w2.shape[1]),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
    if hidden_states.dtype == torch.bfloat16:
        compute_type = tl.bfloat16