Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
59556265
Unverified
Commit
59556265
authored
Apr 20, 2026
by
Yan Ma
Committed by
GitHub
Apr 20, 2026
Browse files
[XPU] fix MoE triton backend in online fp8 quantization (#40109)
Signed-off-by:
Yan Ma
<
yan.ma@intel.com
>
parent
3a30eaa1
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
13 additions
and
4 deletions
+13
-4
vllm/model_executor/layers/fused_moe/oracle/fp8.py
vllm/model_executor/layers/fused_moe/oracle/fp8.py
+6
-0
vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
+7
-0
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+0
-4
No files found.
vllm/model_executor/layers/fused_moe/oracle/fp8.py
View file @
59556265
...
...
@@ -471,6 +471,12 @@ def convert_to_fp8_moe_kernel_format(
w2_input_scale
=
w2_input_scale
,
is_trtllm
=
(
fp8_backend
==
Fp8MoeBackend
.
FLASHINFER_TRTLLM
),
)
elif
fp8_backend
==
Fp8MoeBackend
.
XPU
:
from
vllm.model_executor.layers.fused_moe.xpu_fused_moe
import
(
prepare_fp8_moe_layer_for_xpu
,
)
w13
,
w2
=
prepare_fp8_moe_layer_for_xpu
(
w13
,
w2
)
else
:
if
fp8_backend
not
in
[
Fp8MoeBackend
.
TRITON
,
...
...
vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
View file @
59556265
...
...
@@ -24,6 +24,13 @@ if current_platform.is_xpu():
from
vllm_xpu_kernels.fused_moe_interface
import
xpu_fused_moe
def prepare_fp8_moe_layer_for_xpu(
    w13: torch.Tensor,
    w2: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Return both MoE weight tensors with their last two dims swapped.

    Each input is transposed over its final two dimensions and then made
    contiguous, so the results are laid out densely in the new order.

    Args:
        w13: First MoE weight tensor (presumably the fused gate/up
            projection, one matrix per expert -- confirm against caller).
        w2: Second MoE weight tensor (presumably the down projection --
            confirm against caller).

    Returns:
        A ``(w13_t, w2_t)`` tuple of contiguous tensors whose last two
        dimensions are swapped relative to the corresponding input.
    """
    # transpose() only yields a strided view; contiguous() materializes the
    # swapped layout in memory.
    w13_t = w13.transpose(-1, -2).contiguous()
    w2_t = w2.transpose(-1, -2).contiguous()
    return w13_t, w2_t
class
XPUExperts
(
mk
.
FusedMoEExpertsModular
):
def
__init__
(
self
,
...
...
vllm/model_executor/layers/quantization/fp8.py
View file @
59556265
...
...
@@ -1019,10 +1019,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
layer
.
w2_weight
[
expert
,
:,
:]
)
if
current_platform
.
is_xpu
():
w13
.
data
=
w13
.
transpose
(
-
1
,
-
2
).
contiguous
()
w2
.
data
=
w2
.
transpose
(
-
1
,
-
2
).
contiguous
()
# Shuffle weights to runtime format and setup kernel.
self
.
_setup_kernel
(
layer
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment