Commit 6b0aeb58 (unverified) authored by Cheng Wan, committed by GitHub

[moe] optim: reduce memory consumption in fused_moe (#3692)

parent bb3e5268
@@ -933,20 +933,21 @@ def fused_experts_impl(
     config = get_config_func(M)
-    intermediate_cache1 = torch.empty(
-        (M, topk_ids.shape[1], N),
+    cache = torch.empty(
+        M * topk_ids.shape[1] * max(N, w2.shape[1]),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = cache[: M * topk_ids.shape[1] * N].view(
+        (M, topk_ids.shape[1], N),
+    )
     intermediate_cache2 = torch.empty(
         (M * topk_ids.shape[1], N // 2),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
     )
-    intermediate_cache3 = torch.empty(
+    intermediate_cache3 = cache[: M * topk_ids.shape[1] * w2.shape[1]].view(
         (M, topk_ids.shape[1], w2.shape[1]),
-        device=hidden_states.device,
-        dtype=hidden_states.dtype,
     )
     compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
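The reuse is safe because the two intermediates are never live at the same time: `intermediate_cache1` is consumed by the activation step that fills `intermediate_cache2`, and `intermediate_cache3` is written only after that, so both can alias the same storage. Below is a minimal, self-contained sketch of the aliasing trick; the shape values and the names `M`, `topk`, `N`, `K` are illustrative placeholders (with `K` standing in for `w2.shape[1]`), not values from the commit.

```python
import torch

# Illustrative shapes only: M tokens, topk experts per token,
# N = w1 output width, K = w2.shape[1] (final hidden size).
M, topk, N, K = 4, 2, 512, 256

# One flat buffer sized for the larger of the two intermediates
# replaces the two separate allocations used before this commit.
cache = torch.empty(M * topk * max(N, K), dtype=torch.bfloat16)

# Both views alias the front of the same storage; because their
# lifetimes do not overlap, neither clobbers data the other needs.
intermediate_cache1 = cache[: M * topk * N].view(M, topk, N)
intermediate_cache3 = cache[: M * topk * K].view(M, topk, K)

# The two tensors share one allocation.
assert intermediate_cache1.data_ptr() == intermediate_cache3.data_ptr()
print(intermediate_cache1.shape, intermediate_cache3.shape)
```

Compared with allocating the two caches separately, this saves roughly `M * topk_ids.shape[1] * min(N, w2.shape[1])` elements per call, since only the larger of the two buffers is ever materialized.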