[Kernel] Raise an exception in MoE kernel if the batch size is larger than 65k (#5939)

f7dac83d · Cody Yu · GitHub · 7c01f706 · f7dac83d
Unverified Commit f7dac83d authored Jun 29, 2024 by Cody Yu Committed by GitHub Jun 29, 2024
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 0 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py vllm/model_executor/layers/fused_moe/fused_moe.py +5 -0

No files found.
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -423,6 +423,11 @@ def fused_experts(hidden_states: torch.Tensor,
    M, _ = hidden_states.shape
    E, N, _ = w1.shape

+    if M > 65536:
+        # https://github.com/vllm-project/vllm/issues/5938
+        raise ValueError("MoE kernel does not support more than 65536 tokens, "
+                         f"but got {M}")
+
    if override_config:
        config = override_config
    else: