Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645)

Signed-off-by: Wes Medford <wryanmedford@gmail.com>

Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645)
Signed-off-by: Wes Medford <wryanmedford@gmail.com>
4ae17bf1 · Wes · GitHub · 8a49eea7 · 4ae17bf1
Unverified commit 4ae17bf1, authored Mar 27, 2025 by Wes and committed by GitHub on Mar 27, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 12 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py vllm/model_executor/layers/fused_moe/fused_moe.py +4 -12

No files found.
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq(
                    mask=token_mask[:, None] &
                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                    other=0.0)
-        b = tl.load(
+        b = tl.load(b_ptrs)
-            b_ptrs,
-            cache_modifier=".cg",
-            eviction_policy="evict_last",
-        )
        if use_int4_w4a16:
            b = (b >> b_shifter) & 0xF
@@ -395,13 +391,9 @@ def fused_moe_kernel(
                    mask=token_mask[:, None] &
                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                    other=0.0)
-        b = tl.load(
+        b = tl.load(b_ptrs,
-            b_ptrs,
+                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
-            mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
+                    other=0.0)
-            other=0.0,
-            cache_modifier=".cg",
-            eviction_policy="evict_last",
-        )
        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)