Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2ce9fe4a
Unverified
Commit
2ce9fe4a
authored Feb 06, 2026 by zofia
Committed by GitHub Feb 06, 2026
Browse files
[XPU][5/N] add wna16 xpu kernel (#33973)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
parent
cd8b405b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 58 additions and 66 deletions
+58
-66
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-0
vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py
...ecutor/layers/quantization/kernels/mixed_precision/xpu.py
+57
-66
No files found.
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
2ce9fe4a
...
@@ -39,6 +39,7 @@ docker run \
...
@@ -39,6 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
cd tests
...
...
vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py
View file @
2ce9fe4a
...
@@ -3,88 +3,71 @@
...
@@ -3,88 +3,71 @@
import
torch
import
torch
from
torch.nn.parameter
import
Parameter
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
.MPLinearKernel
import
MPLinearKernel
,
MPLinearLayerConfig
from
.MPLinearKernel
import
MPLinearKernel
,
MPLinearLayerConfig
_XPUWNA16_SUPPORTED_QUANT_TYPES
=
(
scalar_types
.
uint4
,
scalar_types
.
uint4b8
)
class
XPUwNa16LinearKernel
(
MPLinearKernel
):
class
XPUwNa16LinearKernel
(
MPLinearKernel
):
@
classmethod
@
classmethod
def
get_min_capability
(
cls
)
->
int
:
def
get_min_capability
(
cls
)
->
int
:
return
0
return
-
1
@
classmethod
@
classmethod
def
can_implement
(
cls
,
c
:
MPLinearLayerConfig
)
->
tuple
[
bool
,
str
|
None
]:
def
can_implement
(
cls
,
c
:
MPLinearLayerConfig
)
->
tuple
[
bool
,
str
|
None
]:
if
not
current_platform
.
is_xpu
():
if
not
current_platform
.
is_xpu
():
return
False
,
"IPEX wNa16 only supported on XPU/CPU devices"
return
False
,
"XPUwNa16 only supported on XPU"
if
c
.
act_type
!=
torch
.
bfloat16
and
c
.
act_type
!=
torch
.
float16
:
return
False
,
"XPUwNa16 only supports BF16/FP16 activations"
# TODO: (yiliu30) relax these restrictions in later PRs
if
c
.
weight_type
not
in
_XPUWNA16_SUPPORTED_QUANT_TYPES
:
if
c
.
zero_points
:
return
(
return
False
,
"Zero points not supported for Now"
False
,
f
"Quant type (
{
c
.
weight_type
}
) not supported by "
"XPUwNa16, supported types are: "
f
"
{
_XPUWNA16_SUPPORTED_QUANT_TYPES
}
"
,
)
if
c
.
group_size
!=
-
1
and
c
.
group_size
%
32
!=
0
:
return
(
False
,
f
"Group size (
{
c
.
group_size
}
) not supported by "
"XPUwNa16, supported group sizes are multiples of 32"
,
)
if
c
.
partition_weight_shape
[
0
]
%
32
!=
0
:
return
(
False
,
f
"Input size (
{
c
.
partition_weight_shape
[
0
]
}
) not supported by "
"XPUwNa16, supported sizes are multiples of 32"
,
)
if
c
.
partition_weight_shape
[
1
]
%
32
!=
0
:
return
(
False
,
f
"Output size (
{
c
.
partition_weight_shape
[
1
]
}
) not supported by "
"XPUWNA16, supported sizes are multiples of 32"
,
)
return
True
,
None
return
True
,
None
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
):
from
packaging
import
version
layer
.
weight_scale
.
data
=
layer
.
weight_scale
.
t
().
contiguous
()
MIN_IPEX_VERSION
=
"2.6.0"
bias
=
layer
.
bias
if
not
layer
.
skip_bias_add
else
None
try
:
import
intel_extension_for_pytorch
as
ipex
if
version
.
parse
(
ipex
.
__version__
)
<
version
.
parse
(
MIN_IPEX_VERSION
):
raise
ImportError
(
"intel_extension_for_pytorch version is "
"wrong. Please install "
f
"intel_extension_for_pytorch>=
{
MIN_IPEX_VERSION
}
."
)
except
ImportError
as
err
:
raise
ImportError
(
"Please install "
f
"intel_extension_for_pytorch>=
{
MIN_IPEX_VERSION
}
via "
f
"`pip install intel_extension_for_pytorch>=
{
MIN_IPEX_VERSION
}
`"
" to use IPEX-AWQ linear method."
)
from
err
# Using the compute dtype (lowp_mode) as INT8 to leverage instructions
# with better performance.
lowp_mode
=
ipex
.
quantization
.
WoqLowpMode
.
INT8
# The weight will be de-packed from INT4 to INT8.
weight_dtype
=
ipex
.
quantization
.
WoqWeightDtype
.
INT4
# The float activation will be quantized (dynamic, per-token) to INT8.
act_quant_mode
=
ipex
.
quantization
.
WoqActQuantMode
.
PER_BATCH
qconfig
=
ipex
.
quantization
.
get_weight_only_quant_qconfig_mapping
(
weight_dtype
=
weight_dtype
,
lowp_mode
=
lowp_mode
,
act_quant_mode
=
act_quant_mode
,
group_size
=
self
.
config
.
group_size
,
weight_qscheme
=
ipex
.
quantization
.
WoqWeightQScheme
.
SYMMETRIC
,
)
qweight
=
layer
.
weight_packed
g_idx
=
layer
.
weight_g_idx
if
self
.
config
.
has_g_idx
else
None
scales
=
layer
.
weight_scale
qzeros
=
None
if
self
.
config
.
zero_points
:
if
self
.
config
.
zero_points
:
qzeros
=
layer
.
weight_zero_point
.
contiguous
()
layer
.
weight_zero_point
.
data
=
layer
.
weight_zero_point
.
t
().
contiguous
()
qweight
=
qweight
.
t
().
contiguous
()
else
:
scales
=
scales
.
t
().
contiguous
()
weight_zero_point
=
torch
.
Tensor
([
8
]).
to
(
torch
.
int8
).
to
(
"xpu"
)
layer
.
ipex_output_size
=
self
.
config
.
partition_weight_shape
[
1
]
layer
.
weight_zero_point
=
Parameter
(
weight_zero_point
,
requires_grad
=
False
)
layer
.
ipex_qlinear
=
(
if
self
.
config
.
has_g_idx
:
ipex
.
llm
.
quantization
.
woq_linear
.
IPEXWeightOnlyQuantizedLinear
.
from_weight
(
layer
.
g_idx
.
data
=
layer
.
g_idx
.
t
().
contiguous
()
qweight
,
else
:
scales
,
layer
.
g_idx
=
None
qzeros
,
in_features
=
self
.
config
.
partition_weight_shape
[
0
],
out_features
=
self
.
config
.
partition_weight_shape
[
1
],
qconfig
=
qconfig
,
g_idx
=
g_idx
,
bias
=
bias
,
group_size
=
self
.
config
.
group_size
,
quant_method
=
0
,
# `0` stands for the IPEX GPTQ
)
)
def
apply_weights
(
def
apply_weights
(
self
,
self
,
...
@@ -93,5 +76,13 @@ class XPUwNa16LinearKernel(MPLinearKernel):
...
@@ -93,5 +76,13 @@ class XPUwNa16LinearKernel(MPLinearKernel):
bias
:
torch
.
Tensor
|
None
=
None
,
bias
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
out
=
layer
.
ipex_qlinear
(
reshaped_x
)
out
=
torch
.
ops
.
_xpu_C
.
int4_gemm_w4a16
(
return
out
.
reshape
(
x
.
shape
[:
-
1
]
+
(
layer
.
ipex_output_size
,))
reshaped_x
,
layer
.
weight_packed
.
t
(),
bias
,
layer
.
weight_scale
,
layer
.
weight_zero_point
,
self
.
config
.
group_size
,
layer
.
g_idx
,
)
return
out
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment