Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4ae17bf1
Unverified
Commit
4ae17bf1
authored
Mar 27, 2025
by
Wes
Committed by
GitHub
Mar 27, 2025
Browse files
Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645)
Signed-off-by:
Wes Medford
<
wryanmedford@gmail.com
>
parent
8a49eea7
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
12 deletions
+4
-12
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+4
-12
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
4ae17bf1
...
...
@@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq(
mask
=
token_mask
[:,
None
]
&
(
offs_k
[
None
,
:]
<
K
-
k
*
BLOCK_SIZE_K
),
other
=
0.0
)
b
=
tl
.
load
(
b_ptrs
,
cache_modifier
=
".cg"
,
eviction_policy
=
"evict_last"
,
)
b
=
tl
.
load
(
b_ptrs
)
if
use_int4_w4a16
:
b
=
(
b
>>
b_shifter
)
&
0xF
...
...
@@ -395,13 +391,9 @@ def fused_moe_kernel(
mask
=
token_mask
[:,
None
]
&
(
offs_k
[
None
,
:]
<
K
-
k
*
BLOCK_SIZE_K
),
other
=
0.0
)
b
=
tl
.
load
(
b_ptrs
,
b
=
tl
.
load
(
b_ptrs
,
mask
=
offs_k
[:,
None
]
<
K
-
k
*
BLOCK_SIZE_K
,
other
=
0.0
,
cache_modifier
=
".cg"
,
eviction_policy
=
"evict_last"
,
)
other
=
0.0
)
# We accumulate along the K dimension.
if
use_int8_w8a16
:
accumulator
=
tl
.
dot
(
a
,
b
.
to
(
compute_type
),
acc
=
accumulator
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment