Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eefa41c1
Commit
eefa41c1
authored
Mar 24, 2026
by
zhuwenwen
Browse files
sync v0.18.0
parent
82155c76
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
245 additions
and
310 deletions
+245
-310
vllm/model_executor/models/qwen3.py
vllm/model_executor/models/qwen3.py
+1
-1
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+2
-2
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+2
-2
vllm/model_executor/models/qwen3_next_mtp.py
vllm/model_executor/models/qwen3_next_mtp.py
+1
-1
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
+214
-281
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+3
-3
vllm/model_executor/models/qwen3_vl_moe.py
vllm/model_executor/models/qwen3_vl_moe.py
+1
-1
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+1
-1
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+4
-2
vllm/model_executor/models/seed_oss.py
vllm/model_executor/models/seed_oss.py
+2
-2
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+1
-1
vllm/model_executor/models/solar.py
vllm/model_executor/models/solar.py
+1
-1
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/stablelm.py
+2
-2
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-2
vllm/model_executor/models/step3_text.py
vllm/model_executor/models/step3_text.py
+2
-2
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+1
-1
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+1
-1
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/base.py
+1
-1
vllm/model_executor/models/transformers/moe.py
vllm/model_executor/models/transformers/moe.py
+2
-2
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+1
-1
No files found.
vllm/model_executor/models/qwen3.py
View file @
eefa41c1
...
@@ -314,7 +314,7 @@ class Qwen3ForCausalLM(
...
@@ -314,7 +314,7 @@ class Qwen3ForCausalLM(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
eefa41c1
...
@@ -474,7 +474,7 @@ class Qwen3MoeModel(nn.Module, EagleModelMixin):
...
@@ -474,7 +474,7 @@ class Qwen3MoeModel(nn.Module, EagleModelMixin):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -760,7 +760,7 @@ class Qwen3MoeForCausalLM(
...
@@ -760,7 +760,7 @@ class Qwen3MoeForCausalLM(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/qwen3_next.py
View file @
eefa41c1
...
@@ -1357,7 +1357,7 @@ class Qwen3NextModel(nn.Module):
...
@@ -1357,7 +1357,7 @@ class Qwen3NextModel(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -1604,7 +1604,7 @@ class Qwen3NextForCausalLM(
...
@@ -1604,7 +1604,7 @@ class Qwen3NextForCausalLM(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/qwen3_next_mtp.py
View file @
eefa41c1
...
@@ -261,7 +261,7 @@ class Qwen3NextMTP(nn.Module, QwenNextMixtureOfExperts):
...
@@ -261,7 +261,7 @@ class Qwen3NextMTP(nn.Module, QwenNextMixtureOfExperts):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
vllm/model_executor/models/qwen3_omni_moe_thinker.py
View file @
eefa41c1
...
@@ -22,7 +22,7 @@
...
@@ -22,7 +22,7 @@
# limitations under the License.
# limitations under the License.
"""Inference-only Qwen3-Omni-Moe model (thinker part)."""
"""Inference-only Qwen3-Omni-Moe model (thinker part)."""
from
collections.abc
import
Callable
,
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Callable
,
Iterable
,
Iterator
,
Mapping
,
Sequence
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
Literal
,
cast
from
typing
import
Any
,
Literal
,
cast
...
@@ -109,10 +109,7 @@ from .utils import (
...
@@ -109,10 +109,7 @@ from .utils import (
_merge_multimodal_embeddings
,
_merge_multimodal_embeddings
,
maybe_prefix
,
maybe_prefix
,
)
)
from
.vision
import
(
from
.vision
import
get_vit_attn_backend
get_llm_pos_ids_for_vision
,
get_vit_attn_backend
,
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -1071,7 +1068,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
...
@@ -1071,7 +1068,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -1974,7 +1971,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
...
@@ -1974,7 +1971,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -2019,136 +2016,152 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
...
@@ -2019,136 +2016,152 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
return
loaded_weights
return
loaded_weights
def
get_mrope_input_positions
(
def
_compute_audio_token_count
(
self
,
audio_feature_length
:
int
)
->
int
:
self
,
"""Compute audio tokens from feature length using Qwen3-Omni formula."""
input_tokens
:
list
[
int
],
return
_get_feat_extract_output_lengths
(
mm_features
:
list
[
MultiModalFeatureSpec
],
torch
.
tensor
([
audio_feature_length
])
)
->
tuple
[
torch
.
Tensor
,
int
]:
).
item
()
kwargs
=
MultiModalFeatureSpec
.
gather_kwargs
(
mm_features
,
{
"image_grid_thw"
,
"video_grid_thw"
,
"second_per_grid_ts"
,
"audio_feature_lengths"
,
"use_audio_in_video"
,
},
)
image_grid_thw
=
kwargs
.
get
(
"image_grid_thw"
,
[])
video_grid_thw
=
kwargs
.
get
(
"video_grid_thw"
,
[])
second_per_grid_ts
=
kwargs
.
get
(
"second_per_grid_ts"
,
[])
audio_feature_lengths
=
kwargs
.
get
(
"audio_feature_lengths"
,
[])
use_audio_in_video
=
any
(
kwargs
.
get
(
"use_audio_in_video"
,
[]))
image_grid_thw
=
(
torch
.
stack
if
image_grid_thw
else
torch
.
tensor
)(
image_grid_thw
)
video_grid_thw
=
(
torch
.
stack
if
video_grid_thw
else
torch
.
tensor
)(
video_grid_thw
)
input_ids
=
torch
.
tensor
(
input_tokens
)
if
input_ids
is
None
or
input_ids
.
ndim
!=
1
:
raise
ValueError
(
"_omni3_get_input_positions_tensor expects 1D input_ids"
)
seq_len
=
input_ids
.
shape
[
0
]
def
_get_audio_for_video_mapping
(
self
,
mm_features
:
list
[
MultiModalFeatureSpec
]
)
->
tuple
[
dict
[
int
,
int
],
set
[
int
]]:
"""
Map video offset -> paired audio_feature_length for use_audio_in_video.
if
isinstance
(
audio_feature_lengths
,
list
):
When use_audio_in_video=True, audio is interleaved within video.
audio_feature_lengths
=
torch
.
tensor
(
The pairing is based on feature order in mm_features.
audio_feature_lengths
,
dtype
=
torch
.
long
)
if
not
len
(
second_per_grid_ts
)
and
len
(
video_grid_thw
):
Returns:
second_per_grid_ts
=
2.0
Tuple of (video_offset -> audio_feature_length mapping,
second_per_grids
=
(
set of paired audio offsets to skip)
torch
.
ones
(
len
(
video_grid_thw
),
dtype
=
torch
.
float32
)
"""
*
second_per_grid_ts
videos_with_audio
=
[
)
f
else
:
for
f
in
mm_features
second_per_grids
=
torch
.
tensor
(
second_per_grid_ts
,
dtype
=
torch
.
float32
)
if
f
.
modality
==
"video"
and
f
.
data
.
get
(
"use_audio_in_video"
)
and
f
.
data
[
"use_audio_in_video"
].
data
.
item
()
]
audios
=
[
f
for
f
in
mm_features
if
f
.
modality
==
"audio"
]
mapping
:
dict
[
int
,
int
]
=
{}
paired_audio_offsets
:
set
[
int
]
=
set
()
for
i
,
video_f
in
enumerate
(
videos_with_audio
):
if
i
<
len
(
audios
):
audio_len
=
audios
[
i
].
data
[
"audio_feature_lengths"
].
data
.
item
()
mapping
[
video_f
.
mm_position
.
offset
]
=
audio_len
paired_audio_offsets
.
add
(
audios
[
i
].
mm_position
.
offset
)
return
mapping
,
paired_audio_offsets
def
iter_mm_features
(
self
,
mm_features
:
list
[
MultiModalFeatureSpec
]
)
->
Iterator
[
tuple
[
int
,
str
,
dict
[
str
,
Any
]]]:
"""
Iterate over multimodal features sorted by position offset.
Yields: (offset, modality, feature_data) where feature_data contains:
- image: {"grid_t", "grid_h", "grid_w", "t_factor"}
- video: {"grid_t", "grid_h", "grid_w", "t_factor",
"use_audio_in_video", "audio_feature_length"}
- audio: {"audio_feature_length"}
"""
config
=
self
.
config
config
=
self
.
config
spatial_merge_size
=
config
.
vision_config
.
spatial_merge_size
spatial_merge_size
=
config
.
vision_config
.
spatial_merge_size
image_token_id
=
config
.
image_token_id
video_token_id
=
config
.
video_token_id
audio_token_id
=
config
.
audio_token_id
vision_start_token_id
=
config
.
vision_start_token_id
audio_start_token_id
=
config
.
audio_start_token_id
position_id_per_seconds
=
config
.
position_id_per_seconds
position_id_per_seconds
=
config
.
position_id_per_seconds
vision_start_indices
=
torch
.
argwhere
(
sorted_features
=
sorted
(
mm_features
,
key
=
lambda
f
:
f
.
mm_position
.
offset
)
input_ids
==
vision_start_token_id
audio_for_video
,
paired_audio_offsets
=
self
.
_get_audio_for_video_mapping
(
).
squeeze
(
1
)
sorted_features
if
vision_start_indices
.
numel
()
>
0
:
vision_tokens
=
input_ids
[
vision_start_indices
+
1
]
else
:
vision_tokens
=
input_ids
.
new_empty
((
0
,),
dtype
=
input_ids
.
dtype
)
audio_nums
=
torch
.
sum
(
input_ids
==
audio_start_token_id
)
image_nums
=
(
vision_tokens
==
image_token_id
).
sum
()
video_nums
=
(
(
vision_tokens
==
audio_start_token_id
).
sum
()
if
use_audio_in_video
else
(
vision_tokens
==
video_token_id
).
sum
()
)
)
llm_pos_ids_list
:
list
[
torch
.
Tensor
]
=
[]
for
mm_feature
in
sorted_features
:
st
=
0
offset
=
mm_feature
.
mm_position
.
offset
image_idx
=
0
modality
=
mm_feature
.
modality
video_idx
=
0
audio_idx
=
0
remain_images
,
remain_videos
,
remain_audios
=
image_nums
,
video_nums
,
audio_nums
# noqa: E501
multimodal_nums
=
(
image_nums
+
audio_nums
if
use_audio_in_video
else
image_nums
+
video_nums
+
audio_nums
)
# noqa: E501
for
_
in
range
(
multimodal_nums
):
if
modality
==
"image"
:
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
t
,
h
,
w
=
mm_feature
.
data
[
"image_grid_thw"
].
data
.
tolist
()
if
(
image_token_id
in
input_tokens
or
video_token_id
in
input_tokens
)
and
(
yield
(
remain_videos
>
0
or
remain_images
>
0
offset
,
):
"image"
,
ed_vision_start
=
input_tokens
.
index
(
vision_start_token_id
,
st
)
{
else
:
"grid_t"
:
t
,
ed_vision_start
=
len
(
input_tokens
)
+
1
"grid_h"
:
h
//
spatial_merge_size
,
if
audio_token_id
in
input_tokens
and
remain_audios
>
0
:
"grid_w"
:
w
//
spatial_merge_size
,
ed_audio_start
=
input_tokens
.
index
(
audio_start_token_id
,
st
)
"t_factor"
:
position_id_per_seconds
,
else
:
},
ed_audio_start
=
len
(
input_tokens
)
+
1
min_ed
=
min
(
ed_vision_start
,
ed_audio_start
)
if
min_ed
==
ed_audio_start
:
text_len
=
min_ed
-
st
if
text_len
!=
0
:
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
llm_pos_ids_list
.
append
(
torch
.
arange
(
text_len
,
dtype
=
torch
.
long
)
.
view
(
1
,
-
1
)
.
expand
(
3
,
-
1
)
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
bos_len
=
1
llm_pos_ids_list
.
append
(
torch
.
arange
(
bos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
audio_len
=
_get_feat_extract_output_lengths
(
audio_feature_lengths
[
audio_idx
]
)
)
llm_pos_ids
=
(
elif
modality
==
"video"
:
torch
.
arange
(
audio_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
t
,
h
,
w
=
mm_feature
.
data
[
"video_grid_thw"
].
data
.
tolist
()
+
st_idx
second_per_grid_ts
=
2.0
if
mm_feature
.
data
.
get
(
"second_per_grid_ts"
):
second_per_grid_ts
=
mm_feature
.
data
[
"second_per_grid_ts"
].
data
.
item
()
use_audio_in_video
=
bool
(
mm_feature
.
data
.
get
(
"use_audio_in_video"
)
and
mm_feature
.
data
[
"use_audio_in_video"
].
data
.
item
()
)
)
llm_pos_ids_list
.
append
(
llm_pos_ids
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
yield
(
eos_len
=
1
offset
,
llm_pos_ids_list
.
append
(
"video"
,
torch
.
arange
(
eos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
{
+
st_idx
"grid_t"
:
t
,
"grid_h"
:
h
//
spatial_merge_size
,
"grid_w"
:
w
//
spatial_merge_size
,
"t_factor"
:
second_per_grid_ts
*
position_id_per_seconds
,
"use_audio_in_video"
:
use_audio_in_video
,
"audio_feature_length"
:
audio_for_video
.
get
(
offset
),
},
)
)
st
+=
text_len
+
bos_len
+
audio_len
+
eos_len
elif
modality
==
"audio"
:
if
offset
not
in
paired_audio_offsets
:
audio_len
=
mm_feature
.
data
[
"audio_feature_lengths"
].
data
.
item
()
yield
offset
,
"audio"
,
{
"audio_feature_length"
:
audio_len
}
def
_compute_interleaved_positions
(
self
,
start_idx
:
int
,
data
:
dict
[
str
,
Any
]
)
->
tuple
[
np
.
ndarray
,
int
]:
"""
Compute positions for interleaved video+audio using Qwen3 token-by-token
interleaving logic.
Returns: (position_ids [3, N], total_token_count)
"""
grid_t
=
data
[
"grid_t"
]
grid_h
=
data
[
"grid_h"
]
grid_w
=
data
[
"grid_w"
]
t_factor
=
data
[
"t_factor"
]
audio_feature_length
=
data
[
"audio_feature_length"
]
audio_len
=
self
.
_compute_audio_token_count
(
audio_feature_length
)
h_index
=
np
.
tile
(
np
.
arange
(
grid_h
).
reshape
(
1
,
-
1
,
1
),
(
grid_t
,
1
,
grid_w
)
).
flatten
()
w_index
=
np
.
tile
(
np
.
arange
(
grid_w
).
reshape
(
1
,
1
,
-
1
),
(
grid_t
,
grid_h
,
1
)
).
flatten
()
t_index_raw
=
np
.
arange
(
grid_t
)
t_index_scaled
=
(
t_index_raw
*
t_factor
).
astype
(
np
.
int64
)
t_index
=
np
.
repeat
(
t_index_scaled
,
grid_h
*
grid_w
)
video_pos
=
np
.
stack
([
t_index
,
h_index
,
w_index
])
+
start_idx
audio_pos
=
np
.
broadcast_to
(
np
.
arange
(
audio_len
),
(
3
,
audio_len
))
+
start_idx
video_t_values
=
video_pos
[
0
]
audio_t_values
=
audio_pos
[
0
]
pos_ids_list
:
list
[
np
.
ndarray
]
=
[]
video_idx
,
audio_idx
=
0
,
0
num_video
=
grid_t
*
grid_h
*
grid_w
while
video_idx
<
num_video
and
audio_idx
<
audio_len
:
if
video_t_values
[
video_idx
]
<=
audio_t_values
[
audio_idx
]:
pos_ids_list
.
append
(
video_pos
[:,
video_idx
:
video_idx
+
1
])
video_idx
+=
1
else
:
pos_ids_list
.
append
(
audio_pos
[:,
audio_idx
:
audio_idx
+
1
])
audio_idx
+=
1
audio_idx
+=
1
if
video_idx
<
num_video
:
if
video_idx
<
num_video
:
...
@@ -2247,175 +2260,95 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
...
@@ -2247,175 +2260,95 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
if
text_len
>
0
:
if
text_len
>
0
:
llm_pos_ids_list
.
append
(
llm_pos_ids_list
.
append
(
torch
.
arange
(
bos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
np
.
broadcast_to
(
np
.
arange
(
text_len
),
(
3
,
text_len
))
+
st_idx
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
grid_t
=
image_grid_thw
[
image_idx
][
0
]
grid_hs
=
image_grid_thw
[:,
1
]
grid_ws
=
image_grid_thw
[:,
2
]
t_index
=
torch
.
arange
(
grid_t
)
*
position_id_per_seconds
llm_pos_ids
=
get_llm_pos_ids_for_vision
(
st_idx
,
image_idx
,
spatial_merge_size
,
t_index
,
grid_hs
,
grid_ws
)
image_len
=
image_grid_thw
[
image_idx
].
prod
()
//
(
spatial_merge_size
**
2
)
llm_pos_ids_list
.
append
(
llm_pos_ids
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
eos_len
=
1
llm_pos_ids_list
.
append
(
torch
.
arange
(
eos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
st
+=
text_len
+
bos_len
+
image_len
+
eos_len
image_idx
+=
1
remain_images
-=
1
elif
(
min_ed
==
ed_vision_start
and
input_ids
[
ed_vision_start
+
1
]
==
video_token_id
and
not
use_audio_in_video
):
text_len
=
min_ed
-
st
if
text_len
!=
0
:
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
llm_pos_ids_list
.
append
(
torch
.
arange
(
text_len
,
dtype
=
torch
.
long
)
.
view
(
1
,
-
1
)
.
expand
(
3
,
-
1
)
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
bos_len
=
1
llm_pos_ids_list
.
append
(
torch
.
arange
(
bos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
grid_t
=
video_grid_thw
[
video_idx
][
0
]
grid_hs
=
video_grid_thw
[:,
1
]
grid_ws
=
video_grid_thw
[:,
2
]
t_index
=
(
torch
.
arange
(
grid_t
)
*
float
(
second_per_grids
[
video_idx
].
item
())
*
position_id_per_seconds
)
llm_pos_ids
=
get_llm_pos_ids_for_vision
(
st_idx
,
video_idx
,
spatial_merge_size
,
t_index
,
grid_hs
,
grid_ws
)
video_len
=
video_grid_thw
[
video_idx
].
prod
()
//
(
spatial_merge_size
**
2
)
llm_pos_ids_list
.
append
(
llm_pos_ids
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
eos_len
=
1
llm_pos_ids_list
.
append
(
torch
.
arange
(
eos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
st
+=
text_len
+
bos_len
+
video_len
+
eos_len
video_idx
+=
1
remain_videos
-=
1
elif
(
min_ed
==
ed_vision_start
and
ed_vision_start
+
1
==
ed_audio_start
and
use_audio_in_video
):
text_len
=
min_ed
-
st
if
text_len
!=
0
:
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
llm_pos_ids_list
.
append
(
torch
.
arange
(
text_len
,
dtype
=
torch
.
long
)
.
view
(
1
,
-
1
)
.
expand
(
3
,
-
1
)
+
st_idx
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
bos_len
=
1
bos_block
=
(
torch
.
arange
(
bos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
llm_pos_ids_list
.
append
(
bos_block
)
llm_pos_ids_list
.
append
(
bos_block
)
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
audio_len
=
_get_feat_extract_output_lengths
(
audio_feature_lengths
[
audio_idx
]
)
)
audio_llm_pos_ids
=
(
st_idx
+=
text_len
torch
.
arange
(
audio_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
bos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
]),
(
3
,
1
))
)
llm_pos_ids_list
.
append
(
bos_pos
)
grid_t
=
video_grid_thw
[
video_idx
][
0
]
st_idx
+=
1
grid_hs
=
video_grid_thw
[:,
1
]
grid_ws
=
video_grid_thw
[:,
2
]
if
modality
==
"audio"
:
t_index
=
(
audio_tokens
=
self
.
_compute_audio_token_count
(
torch
.
arange
(
grid_t
)
data
[
"audio_feature_length"
]
*
float
(
second_per_grids
[
video_idx
].
item
())
*
position_id_per_seconds
)
)
video_llm_pos_ids
=
get_llm_pos_ids_for_vision
(
audio_pos
=
(
st_idx
,
video_idx
,
spatial_merge_size
,
t_index
,
grid_hs
,
grid_ws
np
.
broadcast_to
(
np
.
arange
(
audio_tokens
),
(
3
,
audio_tokens
))
+
st_idx
)
)
video_data_index
,
audio_data_index
=
0
,
0
llm_pos_ids_list
.
append
(
audio_pos
)
while
(
st_idx
=
int
(
audio_pos
.
max
())
+
1
video_data_index
<
video_llm_pos_ids
.
shape
[
-
1
]
and
audio_data_index
<
audio_llm_pos_ids
.
shape
[
-
1
]
eos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
]),
(
3
,
1
))
):
llm_pos_ids_list
.
append
(
eos_pos
)
if
(
st
=
offset
+
1
+
audio_tokens
+
1
video_llm_pos_ids
[
0
][
video_data_index
]
<=
audio_llm_pos_ids
[
0
][
audio_data_index
]
elif
modality
==
"image"
:
):
grid_t
=
data
[
"grid_t"
]
llm_pos_ids_list
.
append
(
grid_h
=
data
[
"grid_h"
]
video_llm_pos_ids
[
grid_w
=
data
[
"grid_w"
]
:,
video_data_index
:
video_data_index
+
1
t_factor
=
data
[
"t_factor"
]
]
)
grid_indices
=
np
.
indices
((
grid_t
,
grid_h
,
grid_w
))
video_data_index
+=
1
if
t_factor
!=
1.0
:
else
:
grid_indices
[
0
]
=
(
grid_indices
[
0
]
*
t_factor
).
astype
(
np
.
int64
)
llm_pos_ids_list
.
append
(
llm_pos_ids_list
.
append
(
grid_indices
.
reshape
(
3
,
-
1
)
+
st_idx
)
audio_llm_pos_ids
[
:,
audio_data_index
:
audio_data_index
+
1
image_len
=
grid_t
*
grid_h
*
grid_w
]
st_idx
=
int
(
llm_pos_ids_list
[
-
1
].
max
())
+
1
)
audio_data_index
+=
1
eos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
]),
(
3
,
1
))
if
video_data_index
<
video_llm_pos_ids
.
shape
[
-
1
]:
llm_pos_ids_list
.
append
(
eos_pos
)
llm_pos_ids_list
.
append
(
st
=
offset
+
1
+
image_len
+
1
video_llm_pos_ids
[
:,
video_data_index
:
video_llm_pos_ids
.
shape
[
-
1
]
elif
modality
==
"video"
:
]
grid_t
=
data
[
"grid_t"
]
)
grid_h
=
data
[
"grid_h"
]
if
audio_data_index
<
audio_llm_pos_ids
.
shape
[
-
1
]:
grid_w
=
data
[
"grid_w"
]
llm_pos_ids_list
.
append
(
t_factor
=
data
[
"t_factor"
]
audio_llm_pos_ids
[
:,
audio_data_index
:
audio_llm_pos_ids
.
shape
[
-
1
]
if
not
data
[
"use_audio_in_video"
]:
]
grid_indices
=
np
.
indices
((
grid_t
,
grid_h
,
grid_w
))
if
t_factor
!=
1.0
:
grid_indices
[
0
]
=
(
grid_indices
[
0
]
*
t_factor
).
astype
(
np
.
int64
)
llm_pos_ids_list
.
append
(
grid_indices
.
reshape
(
3
,
-
1
)
+
st_idx
)
video_len
=
grid_t
*
grid_h
*
grid_w
st_idx
=
int
(
llm_pos_ids_list
[
-
1
].
max
())
+
1
eos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
]),
(
3
,
1
))
llm_pos_ids_list
.
append
(
eos_pos
)
st
=
offset
+
1
+
video_len
+
1
else
:
audio_bos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
-
1
]),
(
3
,
1
))
llm_pos_ids_list
.
append
(
audio_bos_pos
)
pos_ids
,
_
=
self
.
_compute_interleaved_positions
(
st_idx
,
data
)
llm_pos_ids_list
.
append
(
pos_ids
)
st_idx
=
int
(
pos_ids
.
max
())
+
1
eos_pos
=
np
.
broadcast_to
(
np
.
array
([
st_idx
]),
(
3
,
1
))
llm_pos_ids_list
.
append
(
eos_pos
)
llm_pos_ids_list
.
append
(
eos_pos
)
video_len
=
grid_t
*
grid_h
*
grid_w
audio_len
=
self
.
_compute_audio_token_count
(
data
[
"audio_feature_length"
]
)
)
video_len
=
video_grid_thw
[
video_idx
].
prod
()
//
(
spatial_merge_size
**
2
)
st
=
offset
+
2
+
video_len
+
audio_len
+
2
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
eos_len
=
1
eos_block
=
(
torch
.
arange
(
eos_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
+
st_idx
)
llm_pos_ids_list
.
append
(
eos_block
)
llm_pos_ids_list
.
append
(
eos_block
)
st
+=
text_len
+
bos_len
*
2
+
audio_len
+
video_len
+
eos_len
*
2
# noqa: E501
audio_idx
+=
1
video_idx
+=
1
remain_videos
-=
1
remain_audios
-=
1
if
st
<
len
(
input_tokens
)
:
if
st
<
seq_len
:
st_idx
=
llm_pos_ids_list
[
-
1
].
max
()
+
1
if
llm_pos_ids_list
else
0
st_idx
=
int
(
llm_pos_ids_list
[
-
1
].
max
()
)
+
1
if
llm_pos_ids_list
else
0
text_len
=
len
(
input_tokens
)
-
st
text_len
=
seq_len
-
st
llm_pos_ids_list
.
append
(
llm_pos_ids_list
.
append
(
torch
.
arange
(
text_len
,
dtype
=
torch
.
long
).
view
(
1
,
-
1
).
expand
(
3
,
-
1
)
np
.
broadcast_to
(
np
.
arange
(
text_len
),
(
3
,
text_len
))
+
st_idx
+
st_idx
)
)
llm_positions
=
torch
.
c
at
(
llm_pos_ids_list
,
dim
=
1
).
reshape
(
3
,
-
1
)
llm_positions
=
np
.
concaten
at
e
(
llm_pos_ids_list
,
axis
=
1
).
reshape
(
3
,
-
1
)
if
llm_positions
.
shape
[
1
]
!=
seq_len
:
if
llm_positions
.
shape
[
1
]
!=
seq_len
:
raise
RuntimeError
(
"Position ids length mismatch with input ids length"
)
raise
RuntimeError
(
"Position ids length mismatch with input ids length"
)
mrope_position_delta
=
llm_positions
.
max
()
+
1
-
seq_len
mrope_position_delta
=
int
(
llm_positions
.
max
()
)
+
1
-
seq_len
return
llm_positions
,
mrope_position_delta
return
torch
.
from_numpy
(
llm_positions
)
,
mrope_position_delta
def
get_mm_mapping
(
self
)
->
MultiModelKeys
:
def
get_mm_mapping
(
self
)
->
MultiModelKeys
:
"""
"""
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
eefa41c1
...
@@ -1257,7 +1257,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
...
@@ -1257,7 +1257,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
class
Qwen3LLMModel
(
Qwen3Model
):
class
Qwen3LLMModel
(
Qwen3Model
):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -2245,7 +2245,7 @@ class Qwen3VLForConditionalGeneration(
...
@@ -2245,7 +2245,7 @@ class Qwen3VLForConditionalGeneration(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -2341,4 +2341,4 @@ class Qwen3VLForConditionalGeneration(
...
@@ -2341,4 +2341,4 @@ class Qwen3VLForConditionalGeneration(
@
lru_cache
@
lru_cache
def
_cached_tensor
(
x
,
device
)
->
torch
.
Tensor
:
def
_cached_tensor
(
x
,
device
)
->
torch
.
Tensor
:
return
torch
.
tensor
(
x
,
device
=
device
)
return
torch
.
tensor
(
x
,
device
=
device
)
\ No newline at end of file
vllm/model_executor/models/qwen3_vl_moe.py
View file @
eefa41c1
...
@@ -85,7 +85,7 @@ class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):
...
@@ -85,7 +85,7 @@ class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):
class
Qwen3MoeLLMModel
(
Qwen3MoeModel
):
class
Qwen3MoeLLMModel
(
Qwen3MoeModel
):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/qwen_vl.py
View file @
eefa41c1
...
@@ -665,7 +665,7 @@ class QwenVLForConditionalGeneration(
...
@@ -665,7 +665,7 @@ class QwenVLForConditionalGeneration(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/registry.py
View file @
eefa41c1
...
@@ -379,8 +379,9 @@ _MULTIMODAL_MODELS = {
...
@@ -379,8 +379,9 @@ _MULTIMODAL_MODELS = {
),
),
"GlmAsrForConditionalGeneration"
:
(
"glmasr"
,
"GlmAsrForConditionalGeneration"
),
"GlmAsrForConditionalGeneration"
:
(
"glmasr"
,
"GlmAsrForConditionalGeneration"
),
"GLM4VForCausalLM"
:
(
"glm4v"
,
"GLM4VForCausalLM"
),
"GLM4VForCausalLM"
:
(
"glm4v"
,
"GLM4VForCausalLM"
),
"Glm4vForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vForConditionalGeneration"
),
# noqa: E501
"Glm4vForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vForConditionalGeneration"
),
"Glm4vMoeForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vMoeForConditionalGeneration"
),
# noqa: E501
"Glm4vMoeForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vMoeForConditionalGeneration"
),
"GlmOcrForConditionalGeneration"
:
(
"glm_ocr"
,
"GlmOcrForConditionalGeneration"
),
# noqa: E501
"GraniteSpeechForConditionalGeneration"
:
(
"GraniteSpeechForConditionalGeneration"
:
(
"granite_speech"
,
"granite_speech"
,
"GraniteSpeechForConditionalGeneration"
,
"GraniteSpeechForConditionalGeneration"
,
...
@@ -564,6 +565,7 @@ _SPECULATIVE_DECODING_MODELS = {
...
@@ -564,6 +565,7 @@ _SPECULATIVE_DECODING_MODELS = {
"LongCatFlashMTPModel"
:
(
"longcat_flash_mtp"
,
"LongCatFlashMTP"
),
"LongCatFlashMTPModel"
:
(
"longcat_flash_mtp"
,
"LongCatFlashMTP"
),
"Glm4MoeMTPModel"
:
(
"glm4_moe_mtp"
,
"Glm4MoeMTP"
),
"Glm4MoeMTPModel"
:
(
"glm4_moe_mtp"
,
"Glm4MoeMTP"
),
"Glm4MoeLiteMTPModel"
:
(
"glm4_moe_lite_mtp"
,
"Glm4MoeLiteMTP"
),
"Glm4MoeLiteMTPModel"
:
(
"glm4_moe_lite_mtp"
,
"Glm4MoeLiteMTP"
),
"GlmOcrMTPModel"
:
(
"glm_ocr_mtp"
,
"GlmOcrMTP"
),
"MedusaModel"
:
(
"medusa"
,
"Medusa"
),
"MedusaModel"
:
(
"medusa"
,
"Medusa"
),
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
...
...
vllm/model_executor/models/seed_oss.py
View file @
eefa41c1
...
@@ -334,7 +334,7 @@ class SeedOssModel(nn.Module):
...
@@ -334,7 +334,7 @@ class SeedOssModel(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -467,7 +467,7 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -467,7 +467,7 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
eefa41c1
...
@@ -893,7 +893,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -893,7 +893,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/solar.py
View file @
eefa41c1
...
@@ -465,7 +465,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -465,7 +465,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/stablelm.py
View file @
eefa41c1
...
@@ -246,7 +246,7 @@ class StableLMEpochModel(nn.Module):
...
@@ -246,7 +246,7 @@ class StableLMEpochModel(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -332,7 +332,7 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
...
@@ -332,7 +332,7 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/starcoder2.py
View file @
eefa41c1
...
@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):
...
@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
...
@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/step3_text.py
View file @
eefa41c1
...
@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):
...
@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
...
@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
eefa41c1
...
@@ -1105,7 +1105,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -1105,7 +1105,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/tarsier.py
View file @
eefa41c1
...
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/transformers/base.py
View file @
eefa41c1
...
@@ -467,7 +467,7 @@ class Base(
...
@@ -467,7 +467,7 @@ class Base(
# vLLM does not support encoder-decoder models, so if any encoder layer is
# vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model
# found in a text only model, we assume the whole model is an encoder model
if
has_encoder
(
self
.
model
)
and
not
is_multimodal
(
self
.
config
):
if
has_encoder
(
self
.
model
)
and
not
is_multimodal
(
self
.
config
):
self
.
check_version
(
"5.0.0
.dev0
"
,
"encoder models support"
)
self
.
check_version
(
"5.0.0"
,
"encoder models support"
)
attn_type
=
AttentionType
.
ENCODER_ONLY
attn_type
=
AttentionType
.
ENCODER_ONLY
else
:
else
:
attn_type
=
AttentionType
.
DECODER
attn_type
=
AttentionType
.
DECODER
...
...
vllm/model_executor/models/transformers/moe.py
View file @
eefa41c1
...
@@ -118,7 +118,7 @@ direct_register_custom_op(
...
@@ -118,7 +118,7 @@ direct_register_custom_op(
class
MoEMixin
(
MixtureOfExperts
):
class
MoEMixin
(
MixtureOfExperts
):
def
__init__
(
self
,
*
,
vllm_config
:
"VllmConfig"
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
"VllmConfig"
,
prefix
:
str
=
""
):
self
.
check_version
(
"5.0.0
.dev0
"
,
"MoE models support"
)
self
.
check_version
(
"5.0.0"
,
"MoE models support"
)
# Skip MixtureOfExperts.__init__ and call the next class in MRO
# Skip MixtureOfExperts.__init__ and call the next class in MRO
super
(
MixtureOfExperts
,
self
).
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
super
(
MixtureOfExperts
,
self
).
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
@@ -336,4 +336,4 @@ class MoEMixin(MixtureOfExperts):
...
@@ -336,4 +336,4 @@ class MoEMixin(MixtureOfExperts):
_recursive_replace
(
self
.
model
,
prefix
=
"model"
)
_recursive_replace
(
self
.
model
,
prefix
=
"model"
)
# Continue with the replacement of layers in Base
# Continue with the replacement of layers in Base
super
().
recursive_replace
()
super
().
recursive_replace
()
\ No newline at end of file
vllm/model_executor/models/ultravox.py
View file @
eefa41c1
...
@@ -725,7 +725,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
...
@@ -725,7 +725,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
torch
.
Tensor
|
None
=
None
,
intermediate_tensors
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
Prev
1
…
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment