Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3cfa63ad
Unverified
Commit
3cfa63ad
authored
Nov 25, 2025
by
Yan Ma
Committed by
GitHub
Nov 24, 2025
Browse files
[XPU]fix Kimi-VL-A3B-thinking on xpu (#29309)
Signed-off-by:
Yan Ma
<
yan.ma@intel.com
>
parent
4d6afcad
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
6 deletions
+14
-6
vllm/model_executor/models/moonvit.py
vllm/model_executor/models/moonvit.py
+14
-6
No files found.
vllm/model_executor/models/moonvit.py
View file @
3cfa63ad
...
@@ -56,10 +56,13 @@ from transformers.utils import is_flash_attn_2_available
...
@@ -56,10 +56,13 @@ from transformers.utils import is_flash_attn_2_available
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.platforms
import
current_platform
from
vllm.transformers_utils.configs.moonvit
import
MoonViTConfig
from
vllm.transformers_utils.configs.moonvit
import
MoonViTConfig
if
is_flash_attn_2_available
():
if
is_flash_attn_2_available
():
from
flash_attn
import
flash_attn_varlen_func
from
flash_attn
import
flash_attn_varlen_func
elif
current_platform
.
is_xpu
():
from
vllm.attention.utils.fa_utils
import
flash_attn_varlen_func
else
:
else
:
flash_attn_varlen_func
=
None
flash_attn_varlen_func
=
None
...
@@ -106,10 +109,10 @@ def multihead_attention(
...
@@ -106,10 +109,10 @@ def multihead_attention(
q
,
q
,
k
,
k
,
v
,
v
,
q_cu_seqlens
,
cu_seqlens_q
=
q_cu_seqlens
,
k_cu_seqlens
,
cu_seqlens_k
=
k_cu_seqlens
,
max_seqlen_q
,
max_seqlen_q
=
max_seqlen_q
,
max_seqlen_k
,
max_seqlen_k
=
max_seqlen_k
,
causal
=
False
,
causal
=
False
,
)
)
attn_out
=
attn_out
.
flatten
(
start_dim
=-
2
)
attn_out
=
attn_out
.
flatten
(
start_dim
=-
2
)
...
@@ -291,7 +294,12 @@ class Rope2DPosEmb(nn.Module):
...
@@ -291,7 +294,12 @@ class Rope2DPosEmb(nn.Module):
"""
"""
def
__init__
(
def
__init__
(
self
,
dim
:
int
,
max_height
:
int
,
max_width
:
int
,
theta_base
=
10000
,
device
=
"cuda"
self
,
dim
:
int
,
max_height
:
int
,
max_width
:
int
,
theta_base
=
10000
,
device
=
current_platform
.
device_type
,
):
):
super
().
__init__
()
super
().
__init__
()
self
.
dim
=
dim
self
.
dim
=
dim
...
@@ -437,7 +445,7 @@ class MoonVitEncoderLayer(nn.Module):
...
@@ -437,7 +445,7 @@ class MoonVitEncoderLayer(nn.Module):
self
.
hidden_size_per_attention_head
=
self
.
hidden_dim
//
self
.
num_heads
self
.
hidden_size_per_attention_head
=
self
.
hidden_dim
//
self
.
num_heads
self
.
attn_implementation
=
attn_implementation
self
.
attn_implementation
=
attn_implementation
# use fa2 in vllm by default
# use fa2 in vllm by default
if
is_flash_attn_2_available
():
if
is_flash_attn_2_available
()
or
current_platform
.
is_xpu
()
:
self
.
attn_implementation
=
"flash_attention_2"
self
.
attn_implementation
=
"flash_attention_2"
self
.
norm0
=
nn
.
LayerNorm
(
hidden_dim
)
self
.
norm0
=
nn
.
LayerNorm
(
hidden_dim
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment