Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
394ff869
Unverified
Commit
394ff869
authored
Apr 12, 2026
by
Yan Ma
Committed by
GitHub
Apr 12, 2026
Browse files
[XPU][CT] support per-channel quantization in xpu fp8 linear method (#38316)
Signed-off-by:
Yan Ma
<
yan.ma@intel.com
>
parent
df1e30e7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
14 additions
and
1 deletion
+14
-1
vllm/model_executor/kernels/linear/__init__.py
vllm/model_executor/kernels/linear/__init__.py
+1
-1
vllm/model_executor/kernels/linear/scaled_mm/xpu.py
vllm/model_executor/kernels/linear/scaled_mm/xpu.py
+13
-0
No files found.
vllm/model_executor/kernels/linear/__init__.py
View file @
394ff869
...
@@ -204,7 +204,7 @@ _POSSIBLE_WFP8A16_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]
...
@@ -204,7 +204,7 @@ _POSSIBLE_WFP8A16_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]
# To be added
# To be added
],
],
PlatformEnum
.
XPU
:
[
PlatformEnum
.
XPU
:
[
# To be added
XPUFP8ScaledMMLinearKernel
,
],
],
}
}
...
...
vllm/model_executor/kernels/linear/scaled_mm/xpu.py
View file @
394ff869
...
@@ -9,6 +9,11 @@ from vllm.model_executor.kernels.linear import ( # noqa: E501
...
@@ -9,6 +9,11 @@ from vllm.model_executor.kernels.linear import ( # noqa: E501
FP8ScaledMMLinearKernel
,
FP8ScaledMMLinearKernel
,
FP8ScaledMMLinearLayerConfig
,
FP8ScaledMMLinearLayerConfig
,
)
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
kFp8StaticChannelSym
,
kFp8StaticTensorSym
,
)
from
vllm.model_executor.utils
import
replace_parameter
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -23,6 +28,11 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
...
@@ -23,6 +28,11 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
@
classmethod
@
classmethod
def
can_implement
(
cls
,
c
:
FP8ScaledMMLinearLayerConfig
)
->
tuple
[
bool
,
str
|
None
]:
def
can_implement
(
cls
,
c
:
FP8ScaledMMLinearLayerConfig
)
->
tuple
[
bool
,
str
|
None
]:
if
c
.
weight_quant_key
not
in
{
kFp8StaticChannelSym
,
kFp8StaticTensorSym
}:
return
(
False
,
"XPUFP8ScaledMM only support per-channel and per-tensor quantization"
,
)
if
c
.
weight_quant_key
.
dtype
not
in
{
torch
.
float8_e5m2
,
torch
.
float8_e4m3fn
}:
if
c
.
weight_quant_key
.
dtype
not
in
{
torch
.
float8_e5m2
,
torch
.
float8_e4m3fn
}:
return
False
,
"XPUFP8ScaledMM only support FP8 weight dtype"
return
False
,
"XPUFP8ScaledMM only support FP8 weight dtype"
return
True
,
None
return
True
,
None
...
@@ -35,6 +45,9 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
...
@@ -35,6 +45,9 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
self
.
config
=
c
self
.
config
=
c
self
.
layer_param_names
=
layer_param_names
self
.
layer_param_names
=
layer_param_names
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
replace_parameter
(
layer
,
"weight"
,
layer
.
weight
.
data
.
t
())
def
apply_weights
(
def
apply_weights
(
self
,
self
,
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment