Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2f1c19b2
Unverified
Commit
2f1c19b2
authored
Jun 12, 2025
by
Ning Xie
Committed by
GitHub
Jun 11, 2025
Browse files
[CI] change spell checker from codespell to typos (#18711)
Signed-off-by:
Andy Xie
<
andy.xning@gmail.com
>
parent
42f52cc9
Changes
57
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
44 additions
and
44 deletions
+44
-44
vllm/model_executor/layers/quantization/utils/int8_utils.py
vllm/model_executor/layers/quantization/utils/int8_utils.py
+1
-1
vllm/model_executor/model_loader/bitsandbytes_loader.py
vllm/model_executor/model_loader/bitsandbytes_loader.py
+1
-1
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+3
-3
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+3
-3
vllm/model_executor/models/eagle.py
vllm/model_executor/models/eagle.py
+1
-1
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+4
-4
vllm/model_executor/models/llama4.py
vllm/model_executor/models/llama4.py
+1
-1
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mixtral_quant.py
+5
-5
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis.py
+1
-1
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3_small.py
+5
-5
vllm/model_executor/models/phi4mm_audio.py
vllm/model_executor/models/phi4mm_audio.py
+4
-4
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+3
-3
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+1
-1
vllm/transformers_utils/processors/ovis.py
vllm/transformers_utils/processors/ovis.py
+1
-1
vllm/worker/hpu_model_runner.py
vllm/worker/hpu_model_runner.py
+1
-1
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+1
-1
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_model_runner.py
+8
-8
No files found.
vllm/model_executor/layers/quantization/utils/int8_utils.py
View file @
2f1c19b2
...
@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
...
@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
quantized tensor along with the scaling factor used for quantization.
quantized tensor along with the scaling factor used for quantization.
Args:
Args:
x: The input tenosr with ndim >= 2.
x: The input tensor with ndim >= 2.
group_size: The group size used for quantization.
group_size: The group size used for quantization.
eps: The minimum to avoid dividing zero.
eps: The minimum to avoid dividing zero.
dtype: The dtype of output tensor. Note that only `torch.int8`
dtype: The dtype of output tensor. Note that only `torch.int8`
...
...
vllm/model_executor/model_loader/bitsandbytes_loader.py
View file @
2f1c19b2
...
@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...
@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
self
.
target_modules
.
append
(
self
.
target_modules
.
append
(
name
.
replace
(
rep_name
,
sub_name
))
name
.
replace
(
rep_name
,
sub_name
))
# Add original module name even if the module has stacked map,
# Add original module name even if the module has stacked map,
# in case model has a mixture of disk-merged and disk-splitted
# in case model has a mixture of disk-merged and disk-split
# weights with same last name.
# weights with same last name.
self
.
target_modules
.
append
(
name
)
self
.
target_modules
.
append
(
name
)
...
...
vllm/model_executor/models/baichuan.py
View file @
2f1c19b2
...
@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
...
@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
self
.
num_heads
=
(
self
.
total_num_heads
//
self
.
num_heads
=
(
self
.
total_num_heads
//
tensor_model_parallel_world_size
)
tensor_model_parallel_world_size
)
self
.
head_dim
=
hidden_size
//
self
.
total_num_heads
self
.
head_dim
=
hidden_size
//
self
.
total_num_heads
self
.
postion_embedding
=
position_embedding
self
.
pos
i
tion_embedding
=
position_embedding
self
.
rope_theta
=
rope_theta
self
.
rope_theta
=
rope_theta
self
.
max_position_embeddings
=
max_position_embeddings
self
.
max_position_embeddings
=
max_position_embeddings
...
@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
...
@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
quant_config
=
quant_config
,
quant_config
=
quant_config
,
)
)
# Create the alibi slopes and slice them.
# Create the alibi slopes and slice them.
if
self
.
postion_embedding
==
"ALIBI"
:
if
self
.
pos
i
tion_embedding
==
"ALIBI"
:
tp_rank
=
get_tensor_model_parallel_rank
()
tp_rank
=
get_tensor_model_parallel_rank
()
head_start
=
tp_rank
*
self
.
num_heads
head_start
=
tp_rank
*
self
.
num_heads
head_end
=
(
tp_rank
+
1
)
*
self
.
num_heads
head_end
=
(
tp_rank
+
1
)
*
self
.
num_heads
...
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
...
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
W_pack
(
hidden_states
)
qkv
,
_
=
self
.
W_pack
(
hidden_states
)
q
,
k
,
v
=
qkv
.
chunk
(
chunks
=
3
,
dim
=-
1
)
q
,
k
,
v
=
qkv
.
chunk
(
chunks
=
3
,
dim
=-
1
)
if
self
.
postion_embedding
!=
"ALIBI"
:
if
self
.
pos
i
tion_embedding
!=
"ALIBI"
:
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
output
,
_
=
self
.
o_proj
(
attn_output
)
output
,
_
=
self
.
o_proj
(
attn_output
)
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
2f1c19b2
...
@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
image_newline
=
nn
.
Parameter
(
self
.
image_newline
=
nn
.
Parameter
(
torch
.
randn
(
self
.
projector_config
.
n_embed
)
*
embed_std
)
torch
.
randn
(
self
.
projector_config
.
n_embed
)
*
embed_std
)
# This is a typo in original implementation
# This is a typo in original implementation
self
.
view_sep
e
rator
=
nn
.
Parameter
(
self
.
view_sep
a
rator
=
nn
.
Parameter
(
torch
.
randn
(
self
.
projector_config
.
n_embed
)
*
embed_std
)
torch
.
randn
(
self
.
projector_config
.
n_embed
)
*
embed_std
)
else
:
else
:
raise
ValueError
(
raise
ValueError
(
...
@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if
self
.
global_view_pos
==
"head"
:
if
self
.
global_view_pos
==
"head"
:
global_local_features
=
torch
.
cat
([
global_local_features
=
torch
.
cat
([
global_features
,
global_features
,
self
.
view_sep
e
rator
[
None
,
:],
self
.
view_sep
a
rator
[
None
,
:],
local_features
,
local_features
,
])
])
else
:
else
:
global_local_features
=
torch
.
cat
([
global_local_features
=
torch
.
cat
([
local_features
,
local_features
,
self
.
view_sep
e
rator
[
None
,
:],
self
.
view_sep
a
rator
[
None
,
:],
global_features
,
global_features
,
])
])
...
...
vllm/model_executor/models/eagle.py
View file @
2f1c19b2
...
@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
...
@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
# This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a
# due to missing lm_head weights and its config being that of a
# Llama model. Here's a compatible version with the same weights:
# Llama model. Here's a compatible version with the same weights:
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
2f1c19b2
...
@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
kwargs
[
"has_images"
]
=
True
kwargs
[
"has_images"
]
=
True
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# This is a HACK. Fix this.
# This is a HACK. Fix this.
start_idices
=
(
positions
==
0
).
cpu
().
nonzero
()
start_i
n
dices
=
(
positions
==
0
).
cpu
().
nonzero
()
num_seqs
=
len
(
start_idices
)
num_seqs
=
len
(
start_i
n
dices
)
seq_lens
=
[]
seq_lens
=
[]
for
i
in
range
(
num_seqs
):
for
i
in
range
(
num_seqs
):
start_idx
=
start_idices
[
i
].
item
()
start_idx
=
start_i
n
dices
[
i
].
item
()
if
i
<
num_seqs
-
1
:
if
i
<
num_seqs
-
1
:
end_idx
=
start_idices
[
i
+
1
].
item
()
end_idx
=
start_i
n
dices
[
i
+
1
].
item
()
else
:
else
:
end_idx
=
len
(
input_ids
)
end_idx
=
len
(
input_ids
)
seq_lens
.
append
(
end_idx
-
start_idx
)
seq_lens
.
append
(
end_idx
-
start_idx
)
...
...
vllm/model_executor/models/llama4.py
View file @
2f1c19b2
...
@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
...
@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
renormalize
:
bool
,
renormalize
:
bool
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
router_scores
,
router_indices
=
fast_topk
(
gating_output
,
topk
,
dim
=-
1
)
router_scores
,
router_indices
=
fast_topk
(
gating_output
,
topk
,
dim
=-
1
)
# psuedo-standard is that the router scores are floats
# pseudo-standard is that the router scores are floats
router_scores
=
torch
.
sigmoid
(
router_scores
.
float
())
router_scores
=
torch
.
sigmoid
(
router_scores
.
float
())
return
(
router_scores
,
router_indices
.
to
(
torch
.
int32
))
return
(
router_scores
,
router_indices
.
to
(
torch
.
int32
))
...
...
vllm/model_executor/models/mixtral_quant.py
View file @
2f1c19b2
...
@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
...
@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
f
"Tensor parallel size
{
self
.
tp_size
}
is greater than "
f
"Tensor parallel size
{
self
.
tp_size
}
is greater than "
f
"the number of experts
{
self
.
num_total_experts
}
."
)
f
"the number of experts
{
self
.
num_total_experts
}
."
)
# Split experts equally between ranks
# Split experts equally between ranks
self
.
expert_indic
i
es
=
np
.
array_split
(
range
(
self
.
expert_indices
=
np
.
array_split
(
range
(
self
.
num_total_experts
),
self
.
num_total_experts
),
self
.
tp_size
)[
self
.
rank
].
tolist
()
self
.
tp_size
)[
self
.
rank
].
tolist
()
if
not
self
.
expert_indic
i
es
:
if
not
self
.
expert_indices
:
raise
ValueError
(
raise
ValueError
(
f
"Rank
{
self
.
rank
}
has no experts assigned to it."
)
f
"Rank
{
self
.
rank
}
has no experts assigned to it."
)
...
@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
...
@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
config
.
hidden_size
,
config
.
hidden_size
,
config
.
intermediate_size
,
config
.
intermediate_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
)
if
idx
in
self
.
expert_indic
i
es
else
None
if
idx
in
self
.
expert_indices
else
None
for
idx
in
range
(
self
.
num_total_experts
)
for
idx
in
range
(
self
.
num_total_experts
)
])
])
self
.
gate
=
ReplicatedLinear
(
config
.
hidden_size
,
self
.
gate
=
ReplicatedLinear
(
config
.
hidden_size
,
...
@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
...
@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
routing_weights
/=
routing_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
routing_weights
/=
routing_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
final_hidden_states
=
None
final_hidden_states
=
None
for
expert_idx
in
self
.
expert_indic
i
es
:
for
expert_idx
in
self
.
expert_indices
:
expert_layer
=
self
.
experts
[
expert_idx
]
expert_layer
=
self
.
experts
[
expert_idx
]
expert_mask
=
(
selected_experts
==
expert_idx
)
expert_mask
=
(
selected_experts
==
expert_idx
)
expert_weights
=
(
routing_weights
*
expert_mask
).
sum
(
dim
=-
1
,
expert_weights
=
(
routing_weights
*
expert_mask
).
sum
(
dim
=-
1
,
...
...
vllm/model_executor/models/ovis.py
View file @
2f1c19b2
...
@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
...
@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
height
,
width
=
self
.
get_hf_processor
().
get_image_size
()
height
,
width
=
self
.
get_hf_processor
().
get_image_size
()
hs
=
self
.
get_hf_config
().
visual_tokenizer_config
.
hidden_stride
hs
=
self
.
get_hf_config
().
visual_tokenizer_config
.
hidden_stride
# NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code
# NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
return
ImageSize
(
width
=
width
*
hs
*
9
,
height
=
height
*
hs
*
9
)
return
ImageSize
(
width
=
width
*
hs
*
9
,
height
=
height
*
hs
*
9
)
...
...
vllm/model_executor/models/phi3_small.py
View file @
2f1c19b2
...
@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
...
@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
self
.
num_q_per_kv
=
self
.
num_heads
//
self
.
num_key_value_heads
self
.
num_q_per_kv
=
self
.
num_heads
//
self
.
num_key_value_heads
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
assert
self
.
num_key_value_heads
%
self
.
tp_size
==
0
assert
self
.
num_key_value_heads
%
self
.
tp_size
==
0
self
.
num_kv_heads_per_partion
=
max
(
self
.
num_kv_heads_per_parti
ti
on
=
max
(
1
,
self
.
num_key_value_heads
//
self
.
tp_size
)
1
,
self
.
num_key_value_heads
//
self
.
tp_size
)
self
.
num_heads_per_partition
=
self
.
num_heads
//
self
.
tp_size
self
.
num_heads_per_partition
=
self
.
num_heads
//
self
.
tp_size
...
@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
...
@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
bs_params
=
{
bs_params
=
{
'max_seqlen'
:
self
.
max_position_embeddings
,
'max_seqlen'
:
self
.
max_position_embeddings
,
'num_heads'
:
self
.
num_heads_per_partition
,
'num_heads'
:
self
.
num_heads_per_partition
,
"num_kv_heads"
:
self
.
num_kv_heads_per_partion
,
"num_kv_heads"
:
self
.
num_kv_heads_per_parti
ti
on
,
"block_size"
:
self
.
sparse_block_size
,
"block_size"
:
self
.
sparse_block_size
,
"local_blocks"
:
self
.
local_blocks
,
"local_blocks"
:
self
.
local_blocks
,
"vert_stride"
:
self
.
vert_stride
,
"vert_stride"
:
self
.
vert_stride
,
...
@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
...
@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
self
.
attn
=
Attention
(
self
.
num_heads_per_partition
,
self
.
attn
=
Attention
(
self
.
num_heads_per_partition
,
self
.
head_dim
,
self
.
head_dim
,
self
.
scale
,
self
.
scale
,
num_kv_heads
=
self
.
num_kv_heads_per_partion
,
num_kv_heads
=
self
.
num_kv_heads_per_parti
ti
on
,
cache_config
=
cache_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
blocksparse_params
=
bs_params
,
blocksparse_params
=
bs_params
,
...
@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
...
@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
# NOTE: this is required by RotaryEmbed, which indeed does not have to
# NOTE: this is required by RotaryEmbed, which indeed does not have to
# TODO: allow 3D QK for rotary forward
# TODO: allow 3D QK for rotary forward
q
=
q
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_heads_per_partition
)
q
=
q
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_heads_per_partition
)
k
=
k
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_kv_heads_per_partion
)
k
=
k
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_kv_heads_per_parti
ti
on
)
v
=
v
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_kv_heads_per_partion
)
v
=
v
.
reshape
(
-
1
,
self
.
head_dim
*
self
.
num_kv_heads_per_parti
ti
on
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
...
...
vllm/model_executor/models/phi4mm_audio.py
View file @
2f1c19b2
...
@@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module):
...
@@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module):
for the last pointwise conv after swish activation.
for the last pointwise conv after swish activation.
depthwise_seperable_out_channel: int
depthwise_seperable_out_channel: int
if set different to 0, the number of
if set different to 0, the number of
depthwise_seperable_out_channel will be used as a
depthwise_seperable_out_channel will be used as a
channel_out of the second conv1d layer.
channel_out of the second conv1d layer.
otherwise, it equal to 0, the second conv1d layer is skipped.
otherwise, it equal to 0, the second conv1d layer is skipped.
depthwise_multiplier: int
depthwise_multiplier: int
...
@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
...
@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
(Multi-Head Attention),
(Multi-Head Attention),
1 = typical Multi-Head Attention,
1 = typical Multi-Head Attention,
1 < attn_group_sizes < attention_heads = Grouped-Query Attention
1 < attn_group_sizes < attention_heads = Grouped-Query Attention
attn_group_sizes = attenion_heads = Multi-Query Attention
attn_group_sizes = attention_heads = Multi-Query Attention
"""
"""
def
__init__
(
def
__init__
(
...
@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
...
@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
1 = typical Multi-Head Attention,
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
1 < attention_group_size < attention_heads = Grouped-Query
Attention
Attention
attention_group_size = attenion_heads = Multi-Query Attention
attention_group_size = attention_heads = Multi-Query Attention
"""
"""
def
__init__
(
def
__init__
(
...
@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
...
@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
1 = typical Multi-Head Attention,
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
1 < attention_group_size < attention_heads = Grouped-Query
Attention
Attention
attention_group_size = attenion_heads = Multi-Query Attention
attention_group_size = attention_heads = Multi-Query Attention
"""
"""
extra_multi_layer_output_idxs
:
list
[
int
]
extra_multi_layer_output_idxs
:
list
[
int
]
...
...
vllm/model_executor/models/phimoe.py
View file @
2f1c19b2
...
@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):
...
@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):
grad_at_output
=
grad_at_output
*
multiplier
grad_at_output
=
grad_at_output
*
multiplier
grad_at_scores_expaned
=
masked_gates
*
grad_at_output
.
mul
(
-
1
)
grad_at_scores_expan
d
ed
=
masked_gates
*
grad_at_output
.
mul
(
-
1
)
grad_at_scores_expaned
.
scatter_add_
(
grad_at_scores_expan
d
ed
.
scatter_add_
(
dim
=-
1
,
dim
=-
1
,
index
=
selected_experts
,
index
=
selected_experts
,
src
=
grad_at_output
,
src
=
grad_at_output
,
)
)
return
(
return
(
grad_at_scores_expaned
,
grad_at_scores_expan
d
ed
,
None
,
None
,
None
,
None
,
None
,
None
,
...
...
vllm/multimodal/utils.py
View file @
2f1c19b2
...
@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
...
@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
Returns:
Returns:
list[str]: List of item modalities in order of their positions in the
list[str]: List of item modalities in order of their positions in the
input sequence.
input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.
None otherwise.
...
...
vllm/transformers_utils/processors/ovis.py
View file @
2f1c19b2
...
@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
...
@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
"""
"""
attributes
=
[
"image_processor"
,
"tokenizer"
]
attributes
=
[
"image_processor"
,
"tokenizer"
]
valid_kwargs
=
[
"chat_template"
,
"image_pad_token"
,
"image_seg
e
ment_len"
]
valid_kwargs
=
[
"chat_template"
,
"image_pad_token"
,
"image_segment_len"
]
image_processor_class
=
"AutoImageProcessor"
image_processor_class
=
"AutoImageProcessor"
tokenizer_class
=
"AutoTokenizer"
tokenizer_class
=
"AutoTokenizer"
...
...
vllm/worker/hpu_model_runner.py
View file @
2f1c19b2
...
@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
...
@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
num_decode_tokens
=
0
,
num_decode_tokens
=
0
,
slot_mapping
=
slot_mapping
,
slot_mapping
=
slot_mapping
,
multi_modal_placeholder_index_maps
=
multi_modal_placeholder_index_maps
=
None
,
# FIXME(kzawora): mutli-modality will not work here
None
,
# FIXME(kzawora): multi-modality will not work here
enable_kv_scales_calculation
=
False
,
enable_kv_scales_calculation
=
False
,
)
)
multi_modal_kwargs
=
MultiModalKwargs
.
batch
(
multi_modal_kwargs_list
)
multi_modal_kwargs
=
MultiModalKwargs
.
batch
(
multi_modal_kwargs_list
)
...
...
vllm/worker/multi_step_model_runner.py
View file @
2f1c19b2
...
@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
...
@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
assert
fmi
.
input_tokens
.
shape
[
0
]
>=
self
.
num_seqs
assert
fmi
.
input_tokens
.
shape
[
0
]
>=
self
.
num_seqs
fmi_new_input_tokens
:
torch
.
Tensor
=
fmi
.
input_tokens
[:
self
.
num_seqs
]
fmi_new_input_tokens
:
torch
.
Tensor
=
fmi
.
input_tokens
[:
self
.
num_seqs
]
# Update frozen_model_input::input_positons.
# Update frozen_model_input::input_positions.
assert
fmi
.
input_positions
is
not
None
assert
fmi
.
input_positions
is
not
None
assert
fmi
.
input_positions
.
shape
[
0
]
>=
self
.
num_seqs
assert
fmi
.
input_positions
.
shape
[
0
]
>=
self
.
num_seqs
fmi_new_input_positions
:
torch
.
Tensor
=
fmi
.
input_positions
[:
self
.
fmi_new_input_positions
:
torch
.
Tensor
=
fmi
.
input_positions
[:
self
.
...
...
vllm/worker/tpu_model_runner.py
View file @
2f1c19b2
...
@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
...
@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
"""
"""
batch_size
,
seq_len
=
token_ids
.
shape
batch_size
,
seq_len
=
token_ids
.
shape
# Calculate the positions to sample from.
# Calculate the positions to sample from.
start_indic
i
es
=
torch
.
arange
(
start_indices
=
torch
.
arange
(
batch_size
,
dtype
=
torch
.
int32
,
device
=
input_lens
.
device
)
*
seq_len
batch_size
,
dtype
=
torch
.
int32
,
device
=
input_lens
.
device
)
*
seq_len
logits_indices
=
start_indic
i
es
+
input_lens
-
1
logits_indices
=
start_indices
+
input_lens
-
1
attn_metadata
=
get_forward_context
().
attn_metadata
attn_metadata
=
get_forward_context
().
attn_metadata
# FIXME(woosuk): This is a temporary hack to avoid using the existing
# FIXME(woosuk): This is a temporary hack to avoid using the existing
...
@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
...
@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
num_kv_heads
,
num_blocks
,
block_size
,
_
=
kv_caches
[
0
][
0
].
shape
num_kv_heads
,
num_blocks
,
block_size
,
_
=
kv_caches
[
0
][
0
].
shape
slot_mapping
=
attn_metadata
.
slot_mapping
slot_mapping
=
attn_metadata
.
slot_mapping
slot_mapping
=
slot_mapping
.
flatten
()
slot_mapping
=
slot_mapping
.
flatten
()
head_indic
i
es
=
torch
.
arange
(
0
,
head_indices
=
torch
.
arange
(
0
,
num_kv_heads
,
num_kv_heads
,
device
=
slot_mapping
.
device
,
device
=
slot_mapping
.
device
,
dtype
=
slot_mapping
.
dtype
)
dtype
=
slot_mapping
.
dtype
)
head_indic
i
es
*=
block_size
*
num_blocks
head_indices
*=
block_size
*
num_blocks
slot_mapping
=
slot_mapping
.
repeat_interleave
(
num_kv_heads
).
view
(
slot_mapping
=
slot_mapping
.
repeat_interleave
(
num_kv_heads
).
view
(
-
1
,
num_kv_heads
)
-
1
,
num_kv_heads
)
slot_mapping
=
slot_mapping
+
head_indic
i
es
.
view
(
1
,
-
1
)
slot_mapping
=
slot_mapping
+
head_indices
.
view
(
1
,
-
1
)
slot_mapping
=
slot_mapping
.
flatten
()
slot_mapping
=
slot_mapping
.
flatten
()
attn_metadata
.
slot_mapping
=
slot_mapping
attn_metadata
.
slot_mapping
=
slot_mapping
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment