[BUG] fix moe benchmark when bs*seq is small (#3382)

64480df4 · yiakwy-xpu-ml-framework-team · GitHub · 4530136e · 64480df4
Unverified commit 64480df4, authored Feb 08, 2025 by yiakwy-xpu-ml-framework-team; committed by GitHub on Feb 08, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py +2 -2

No files found.
--- a/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py
@@ -157,7 +157,7 @@ def calculate_diff(batch_size, seq_len):
    )
    sorted_ids_cuda.fill_(topk_ids.numel())
    max_num_m_blocks = max_num_tokens_padded // block_size
-    expert_ids_cuda = torch.empty(
+    expert_ids_cuda = torch.zeros(
        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
    )
    num_tokens_post_pad_cuda = torch.empty(
@@ -172,7 +172,7 @@ def calculate_diff(batch_size, seq_len):
    sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
    sorted_ids_triton.fill_(topk_ids.numel())
-    expert_ids_triton = torch.empty_like(expert_ids_cuda)
+    expert_ids_triton = torch.zeros_like(expert_ids_cuda)
    num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
    # compare the performance of cuda and triton implementation