[Misc] Fix expert_ids shape in MoE (#4517)

826b82a2 · Woosuk Kwon · GitHub · c9d852d6 · 826b82a2
Unverified · Commit 826b82a2, authored May 01, 2024 by Woosuk Kwon · Committed by GitHub, May 01, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 5 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py +6 -5

No files found.
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -203,14 +203,15 @@ def moe_align_block_size(
    - The padding ensures that the total number of tokens is now divisible
        by block_size for proper block matrix operations.
    """
-    sorted_ids = torch.empty(
-        (topk_ids.numel() + num_experts * (block_size - 1), ),
-        dtype=torch.int32,
-        device=topk_ids.device)
-    expert_ids = torch.empty((topk_ids.numel() + num_experts, ),
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded, ),
                             dtype=torch.int32,
                             device=topk_ids.device)
    sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+    expert_ids = torch.empty((max_num_m_blocks, ),
+                             dtype=torch.int32,
+                             device=topk_ids.device)
    num_tokens_post_pad = torch.empty((1),
                                      dtype=torch.int32,
                                      device=topk_ids.device)