Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4fd5389b
Commit
4fd5389b
authored
Jun 12, 2025
by
zhuwenwen
Browse files
update qwen2&2.5-vl prefill interface and fix pixtral run error
parent
504a12b8
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
18 additions
and
12 deletions
+18
-12
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+1
-1
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+7
-4
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+7
-4
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+3
-3
No files found.
vllm/model_executor/models/pixtral.py
View file @
4fd5389b
...
...
@@ -1045,7 +1045,7 @@ class PixtralHFAttention(nn.Module):
q
,
k
,
v
,
attn_mask
=
attention_mask
)
out
=
out
.
transpose
(
1
,
2
)
out
=
out
.
view
(
batch
,
patches
,
self
.
n_heads
*
self
.
head_dim
)
out
=
out
.
reshape
(
batch
,
patches
,
self
.
n_heads
*
self
.
head_dim
)
attn_output
,
_
=
self
.
o_proj
(
out
)
return
attn_output
,
None
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
4fd5389b
...
...
@@ -73,6 +73,7 @@ import os
import
re
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -311,8 +312,10 @@ class Qwen2_5_VisionAttention(nn.Module):
use_flash_attn
=
use_flash_attn
)
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
# from vllm_flash_attn.flash_attn_interface import (
# flash_attn_varlen_func)
if
not
current_platform
.
is_rocm
():
from
vllm_flash_attn.flash_attn_interface
import
(
flash_attn_varlen_func
)
else
:
from
flash_attn
import
flash_attn_varlen_func
q
,
k
,
v
=
(
rearrange
(
x
,
"b s ... -> (b s) ..."
)
for
x
in
[
q
,
k
,
v
])
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
4fd5389b
...
...
@@ -81,6 +81,7 @@ import os
import
re
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -330,8 +331,10 @@ class Qwen2VisionAttention(nn.Module):
k
=
apply_rotary_pos_emb_vision
(
k
,
rotary_pos_emb
)
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
# from vllm_flash_attn.flash_attn_interface import (
# flash_attn_varlen_func)
if
not
current_platform
.
is_rocm
():
from
vllm_flash_attn.flash_attn_interface
import
(
flash_attn_varlen_func
)
else
:
from
flash_attn
import
flash_attn_varlen_func
q
,
k
,
v
=
(
rearrange
(
x
,
"b s ... -> (b s) ..."
)
for
x
in
[
q
,
k
,
v
])
...
...
vllm/model_executor/models/vision.py
View file @
4fd5389b
...
...
@@ -83,11 +83,11 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
if
backend_by_env_var
is
not
None
:
selected_backend
=
backend_name_to_enum
(
backend_by_env_var
)
if
selected_backend
is
None
:
if
current_platform
.
is_cuda
():
if
current_platform
.
is_cuda
()
or
current_platform
.
is_rocm
()
:
device_available
=
current_platform
.
has_device_capability
(
80
)
if
device_available
and
support_fa
:
from
transformers.utils
import
is_flash_attn_2_available
if
is_flash_attn_2_available
():
if
is_flash_attn_2_available
()
or
current_platform
.
is_rocm
()
:
selected_backend
=
_Backend
.
FLASH_ATTN
else
:
logger
.
warning_once
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment