Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fce10dbe
Unverified
Commit
fce10dbe
authored
Aug 27, 2025
by
Kunshang Ji
Committed by
GitHub
Aug 27, 2025
Browse files
[XPU] Add xpu torch.compile support (#22609)
Signed-off-by:
Kunshang Ji
<
kunshang.ji@intel.com
>
parent
d272415e
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
36 additions
and
11 deletions
+36
-11
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-0
vllm/attention/layer.py
vllm/attention/layer.py
+1
-2
vllm/compilation/fix_functionalization.py
vllm/compilation/fix_functionalization.py
+8
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+4
-0
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+4
-0
vllm/platforms/interface.py
vllm/platforms/interface.py
+8
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+4
-0
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+6
-9
No files found.
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
fce10dbe
...
...
@@ -31,6 +31,7 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
...
...
vllm/attention/layer.py
View file @
fce10dbe
...
...
@@ -190,8 +190,7 @@ class Attention(nn.Module, AttentionLayerBase):
# torch.compile works by registering the attention as one giant
# opaque custom op. For other platforms, we directly call them
# and let torch.compile handle them.
self
.
use_direct_call
=
not
current_platform
.
is_cuda_alike
(
)
and
not
current_platform
.
is_cpu
()
self
.
use_direct_call
=
not
current_platform
.
opaque_attention_op
()
self
.
use_output
=
self
.
attn_backend
.
accept_output_buffer
compilation_config
=
get_current_vllm_config
().
compilation_config
...
...
vllm/compilation/fix_functionalization.py
View file @
fce10dbe
...
...
@@ -9,6 +9,7 @@ import torch
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
.fx_utils
import
is_func
from
.vllm_inductor_pass
import
VllmInductorPass
...
...
@@ -26,6 +27,13 @@ class FixFunctionalizationPass(VllmInductorPass):
"""
def
__call__
(
self
,
graph
:
torch
.
fx
.
Graph
):
# XPU does not support auto-functionalization yet.
# Will enable this when switch to vllm-xpu-kernels.
if
current_platform
.
is_xpu
():
logger
.
debug
(
"XPU platform does not support fix functionalization"
"pass currently."
)
return
self
.
begin
()
self
.
dump_graph
(
graph
,
"before_fix_functionalization"
)
...
...
vllm/platforms/cpu.py
View file @
fce10dbe
...
...
@@ -335,3 +335,7 @@ class CpuPlatform(Platform):
return
(
cls
.
supports_v1
(
model_config
)
and
arch
in
(
CpuArchEnum
.
X86
,
CpuArchEnum
.
POWERPC
,
CpuArchEnum
.
ARM
,
CpuArchEnum
.
S390X
))
@
classmethod
def
opaque_attention_op
(
cls
)
->
bool
:
return
True
vllm/platforms/cuda.py
View file @
fce10dbe
...
...
@@ -442,6 +442,10 @@ class CudaPlatformBase(Platform):
def
use_custom_allreduce
(
cls
)
->
bool
:
return
True
@
classmethod
def
opaque_attention_op
(
cls
)
->
bool
:
return
True
@
classmethod
def
get_static_graph_wrapper_cls
(
cls
)
->
str
:
return
"vllm.compilation.cuda_graph.CUDAGraphWrapper"
...
...
vllm/platforms/interface.py
View file @
fce10dbe
...
...
@@ -509,6 +509,14 @@ class Platform:
"""
return
False
@
classmethod
def
opaque_attention_op
(
cls
)
->
bool
:
"""
Returns True if we register attention as one giant opaque custom op
on the current platform
"""
return
False
@
classmethod
def
validate_request
(
cls
,
...
...
vllm/platforms/rocm.py
View file @
fce10dbe
...
...
@@ -411,6 +411,10 @@ class RocmPlatform(Platform):
supported_archs
=
[
'gfx94'
,
'gfx95'
]
return
any
(
gfx
in
gcn_arch
for
gfx
in
supported_archs
)
@
classmethod
def
opaque_attention_op
(
cls
)
->
bool
:
return
True
@
classmethod
def
get_cu_count
(
cls
,
device_id
:
int
=
0
)
->
int
:
return
torch
.
cuda
.
get_device_properties
(
...
...
vllm/platforms/xpu.py
View file @
fce10dbe
...
...
@@ -90,21 +90,14 @@ class XPUPlatform(Platform):
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
64
# FIXME: Temporarily forcing eager mode
# remove after t.compile support stabilizes.
if
(
envs
.
VLLM_USE_V1
and
model_config
is
not
None
and
not
vllm_config
.
model_config
.
enforce_eager
):
from
vllm.config
import
CompilationLevel
vllm_config
.
compilation_config
.
level
=
CompilationLevel
.
NO_COMPILATION
# noqa: E501
# lazy import to avoid circular import
from
vllm.config
import
CUDAGraphMode
compilation_config
=
vllm_config
.
compilation_config
if
compilation_config
.
cudagraph_mode
is
None
or
\
compilation_config
.
cudagraph_mode
.
max_cudagraph_mode
()
\
!=
CUDAGraphMode
.
NONE
:
logger
.
info
(
"[XPU] CUDA graph is not supported on XPU, "
"
disabling
cudagraphs."
)
logger
.
info
(
"[XPU] CUDA graph is not supported on XPU,
disabling
"
"cudagraphs.
Fallback to cudagraph_mode=NONE
"
)
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
NONE
# check and update parallel config
...
...
@@ -182,3 +175,7 @@ class XPUPlatform(Platform):
"Intel Arc A770 have bfloat16 accuracy known issue. "
"You can use float16 instead by explicitly setting the "
"`dtype` flag in CLI, for example: --dtype=half."
)
@
classmethod
def
opaque_attention_op
(
cls
)
->
bool
:
return
True
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment