Unverified commit 4ae17bf1, authored by Wes, committed by GitHub
Browse files

Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645)


Signed-off-by: Wes Medford <wryanmedford@gmail.com>
parent 8a49eea7
...@@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq( ...@@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq(
mask=token_mask[:, None] & mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K), (offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0) other=0.0)
b = tl.load( b = tl.load(b_ptrs)
b_ptrs,
cache_modifier=".cg",
eviction_policy="evict_last",
)
if use_int4_w4a16: if use_int4_w4a16:
b = (b >> b_shifter) & 0xF b = (b >> b_shifter) & 0xF
...@@ -395,13 +391,9 @@ def fused_moe_kernel( ...@@ -395,13 +391,9 @@ def fused_moe_kernel(
mask=token_mask[:, None] & mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K), (offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0) other=0.0)
b = tl.load( b = tl.load(b_ptrs,
b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
other=0.0,
cache_modifier=".cg",
eviction_policy="evict_last",
)
# We accumulate along the K dimension. # We accumulate along the K dimension.
if use_int8_w8a16: if use_int8_w8a16:
accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment