Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da93439
Commit
0da93439
authored
Mar 26, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori
parents
25f2f756
298e5108
Changes
613
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
482 additions
and
1207 deletions
+482
-1207
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+52
-46
vllm/model_executor/models/roberta.py
vllm/model_executor/models/roberta.py
+16
-10
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+37
-530
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+27
-458
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+2
-24
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+8
-5
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_causal.py
+4
-2
vllm/model_executor/offloader/prefetch.py
vllm/model_executor/offloader/prefetch.py
+42
-1
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/deep_gemm_warmup.py
+4
-2
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+87
-9
vllm/multimodal/media/audio.py
vllm/multimodal/media/audio.py
+98
-61
vllm/multimodal/media/video.py
vllm/multimodal/media/video.py
+17
-5
vllm/multimodal/parse.py
vllm/multimodal/parse.py
+1
-1
vllm/multimodal/processing/processor.py
vllm/multimodal/processing/processor.py
+2
-0
vllm/parser/abstract_parser.py
vllm/parser/abstract_parser.py
+20
-10
vllm/parser/parser_manager.py
vllm/parser/parser_manager.py
+1
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+3
-0
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+39
-16
vllm/platforms/interface.py
vllm/platforms/interface.py
+7
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+15
-26
No files found.
Too many changes to show.
To preserve performance only
613 of 613+
files are displayed.
Plain diff
Email patch
vllm/model_executor/models/registry.py
View file @
0da93439
...
...
@@ -124,8 +124,8 @@ _TEXT_GENERATION_MODELS = {
"GPTNeoXForCausalLM"
:
(
"gpt_neox"
,
"GPTNeoXForCausalLM"
),
"GraniteForCausalLM"
:
(
"granite"
,
"GraniteForCausalLM"
),
"GraniteMoeForCausalLM"
:
(
"granitemoe"
,
"GraniteMoeForCausalLM"
),
"GraniteMoeHybridForCausalLM"
:
(
"granitemoehybrid"
,
"GraniteMoeHybridForCausalLM"
),
# noqa: E501
"GraniteMoeSharedForCausalLM"
:
(
"granitemoeshared"
,
"GraniteMoeSharedForCausalLM"
),
# noqa: E501
"GraniteMoeHybridForCausalLM"
:
(
"granitemoehybrid"
,
"GraniteMoeHybridForCausalLM"
),
"GraniteMoeSharedForCausalLM"
:
(
"granitemoeshared"
,
"GraniteMoeSharedForCausalLM"
),
"GritLM"
:
(
"gritlm"
,
"GritLM"
),
"Grok1ModelForCausalLM"
:
(
"grok1"
,
"GrokForCausalLM"
),
"Grok1ForCausalLM"
:
(
"grok1"
,
"GrokForCausalLM"
),
...
...
@@ -143,7 +143,7 @@ _TEXT_GENERATION_MODELS = {
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"Jais2ForCausalLM"
:
(
"jais2"
,
"Jais2ForCausalLM"
),
"JambaForCausalLM"
:
(
"jamba"
,
"JambaForCausalLM"
),
"KimiLinearForCausalLM"
:
(
"kimi_linear"
,
"KimiLinearForCausalLM"
),
# noqa: E501
"KimiLinearForCausalLM"
:
(
"kimi_linear"
,
"KimiLinearForCausalLM"
),
"Lfm2ForCausalLM"
:
(
"lfm2"
,
"Lfm2ForCausalLM"
),
"Lfm2MoeForCausalLM"
:
(
"lfm2_moe"
,
"Lfm2MoeForCausalLM"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
...
...
@@ -249,17 +249,14 @@ _EMBEDDING_MODELS = {
# [Multimodal]
"CLIPModel"
:
(
"clip"
,
"CLIPEmbeddingModel"
),
"ColPaliForRetrieval"
:
(
"colpali"
,
"ColPaliModel"
),
"LlamaNemotronVLModel"
:
(
"nemotron_vl"
,
"LlamaNemotronVLForEmbedding"
),
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
,
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
"SiglipModel"
:
(
"siglip"
,
"SiglipEmbeddingModel"
),
"LlamaNemotronVLModel"
:
(
"nemotron_vl"
,
"LlamaNemotronVLForEmbedding"
,
),
# Technically Terratorch models work on images, both in
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
...
...
@@ -272,10 +269,13 @@ _LATE_INTERACTION_MODELS = {
"HF_ColBERT"
:
(
"colbert"
,
"ColBERTModel"
),
"ColBERTModernBertModel"
:
(
"colbert"
,
"ColBERTModernBertModel"
),
"ColBERTJinaRobertaModel"
:
(
"colbert"
,
"ColBERTJinaRobertaModel"
),
"ColBERTLfm2Model"
:
(
"colbert"
,
"ColBERTLfm2Model"
),
# [Multimodal]
"ColModernVBertForRetrieval"
:
(
"colmodernvbert"
,
"ColModernVBertForRetrieval"
),
"ColPaliForRetrieval"
:
(
"colpali"
,
"ColPaliModel"
),
"ColQwen3"
:
(
"colqwen3"
,
"ColQwen3Model"
),
"OpsColQwen3Model"
:
(
"colqwen3"
,
"ColQwen3Model"
),
"ColQwen3_5"
:
(
"colqwen3_5"
,
"ColQwen3_5Model"
),
"Qwen3VLNemotronEmbedModel"
:
(
"colqwen3"
,
"ColQwen3Model"
),
}
...
...
@@ -302,7 +302,7 @@ _SEQUENCE_CLASSIFICATION_MODELS = {
"bert_with_rope"
,
"GteNewForSequenceClassification"
,
),
"JambaForSequenceClassification"
:
(
"jamba"
,
"JambaForSequenceClassification"
),
# noqa: E501
"JambaForSequenceClassification"
:
(
"jamba"
,
"JambaForSequenceClassification"
),
"LlamaBidirectionalForSequenceClassification"
:
(
"llama"
,
"LlamaBidirectionalForSequenceClassification"
,
...
...
@@ -366,13 +366,13 @@ _MULTIMODAL_MODELS = {
"fireredasr2"
,
"FireRedASR2ForConditionalGeneration"
,
),
"FunASRForConditionalGeneration"
:
(
"funasr"
,
"FunASRForConditionalGeneration"
),
# noqa: E501
"FunASRForConditionalGeneration"
:
(
"funasr"
,
"FunASRForConditionalGeneration"
),
"FunAudioChatForConditionalGeneration"
:
(
"funaudiochat"
,
"FunAudioChatForConditionalGeneration"
,
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"Gemma3ForConditionalGeneration"
:
(
"gemma3_mm"
,
"Gemma3ForConditionalGeneration"
),
# noqa: E501
"Gemma3ForConditionalGeneration"
:
(
"gemma3_mm"
,
"Gemma3ForConditionalGeneration"
),
"Gemma3nForConditionalGeneration"
:
(
"gemma3n_mm"
,
"Gemma3nForConditionalGeneration"
,
...
...
@@ -381,7 +381,7 @@ _MULTIMODAL_MODELS = {
"GLM4VForCausalLM"
:
(
"glm4v"
,
"GLM4VForCausalLM"
),
"Glm4vForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vForConditionalGeneration"
),
"Glm4vMoeForConditionalGeneration"
:
(
"glm4_1v"
,
"Glm4vMoeForConditionalGeneration"
),
"GlmOcrForConditionalGeneration"
:
(
"glm_ocr"
,
"GlmOcrForConditionalGeneration"
),
# noqa: E501
"GlmOcrForConditionalGeneration"
:
(
"glm_ocr"
,
"GlmOcrForConditionalGeneration"
),
"GraniteSpeechForConditionalGeneration"
:
(
"granite_speech"
,
"GraniteSpeechForConditionalGeneration"
,
...
...
@@ -391,13 +391,7 @@ _MULTIMODAL_MODELS = {
"hunyuan_vision"
,
"HunYuanVLForConditionalGeneration"
,
),
"StepVLForConditionalGeneration"
:
(
"step_vl"
,
"StepVLForConditionalGeneration"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"NemotronH_Nano_VL_V2"
:
(
"nano_nemotron_vl"
,
"NemotronH_Nano_VL_V2"
),
"OpenCUAForConditionalGeneration"
:
(
"opencua"
,
"OpenCUAForConditionalGeneration"
,
),
"InternS1ForConditionalGeneration"
:
(
"interns1"
,
"InternS1ForConditionalGeneration"
,
...
...
@@ -415,24 +409,22 @@ _MULTIMODAL_MODELS = {
"Idefics3ForConditionalGeneration"
,
),
"IsaacForConditionalGeneration"
:
(
"isaac"
,
"IsaacForConditionalGeneration"
),
"SmolVLMForConditionalGeneration"
:
(
"smolvlm"
,
"SmolVLMForConditionalGeneration"
),
# noqa: E501
"KananaVForConditionalGeneration"
:
(
"kanana_v"
,
"KananaVForConditionalGeneration"
),
"KeyeForConditionalGeneration"
:
(
"keye"
,
"KeyeForConditionalGeneration"
),
"KeyeVL1_5ForConditionalGeneration"
:
(
"keye_vl1_5"
,
"KeyeVL1_5ForConditionalGeneration"
,
),
"RForConditionalGeneration"
:
(
"rvl"
,
"RForConditionalGeneration"
),
"KimiVLForConditionalGeneration"
:
(
"kimi_vl"
,
"KimiVLForConditionalGeneration"
),
# noqa: E501
"KimiK25ForConditionalGeneration"
:
(
"kimi_k25"
,
"KimiK25ForConditionalGeneration"
),
# noqa: E501
"MoonshotKimiaForCausalLM"
:
(
"kimi_audio"
,
"KimiAudioForConditionalGeneration"
),
# noqa: E501
"KimiVLForConditionalGeneration"
:
(
"kimi_vl"
,
"KimiVLForConditionalGeneration"
),
"KimiK25ForConditionalGeneration"
:
(
"kimi_k25"
,
"KimiK25ForConditionalGeneration"
),
"MoonshotKimiaForCausalLM"
:
(
"kimi_audio"
,
"KimiAudioForConditionalGeneration"
),
"LightOnOCRForConditionalGeneration"
:
(
"lightonocr"
,
"LightOnOCRForConditionalGeneration"
,
),
"Lfm2VlForConditionalGeneration"
:
(
"lfm2_vl"
,
"Lfm2VLForConditionalGeneration"
),
"Llama4ForConditionalGeneration"
:
(
"mllama4"
,
"Llama4ForConditionalGeneration"
),
"Llama_Nemotron_Nano_VL"
:
(
"nemotron_vl"
,
"LlamaNemotronVLChatModel"
),
"Llama4ForConditionalGeneration"
:
(
"mllama4"
,
"Llama4ForConditionalGeneration"
),
# noqa: E501
"LlavaForConditionalGeneration"
:
(
"llava"
,
"LlavaForConditionalGeneration"
),
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
...
...
@@ -446,7 +438,7 @@ _MULTIMODAL_MODELS = {
"llava_onevision"
,
"LlavaOnevisionForConditionalGeneration"
,
),
"MantisForConditionalGeneration"
:
(
"llava"
,
"MantisForConditionalGeneration"
),
# noqa: E501
"MantisForConditionalGeneration"
:
(
"llava"
,
"MantisForConditionalGeneration"
),
"MiDashengLMModel"
:
(
"midashenglm"
,
"MiDashengLMModel"
),
"MiniMaxVL01ForConditionalGeneration"
:
(
"minimax_vl_01"
,
...
...
@@ -460,7 +452,9 @@ _MULTIMODAL_MODELS = {
),
"MolmoForCausalLM"
:
(
"molmo"
,
"MolmoForCausalLM"
),
"Molmo2ForConditionalGeneration"
:
(
"molmo2"
,
"Molmo2ForConditionalGeneration"
),
"NemotronH_Nano_VL_V2"
:
(
"nano_nemotron_vl"
,
"NemotronH_Nano_VL_V2"
),
"NVLM_D"
:
(
"nvlm_d"
,
"NVLM_D_Model"
),
"OpenCUAForConditionalGeneration"
:
(
"opencua"
,
"OpenCUAForConditionalGeneration"
),
"OpenPanguVLForConditionalGeneration"
:
(
"openpangu_vl"
,
"OpenPanguVLForConditionalGeneration"
,
...
...
@@ -479,9 +473,9 @@ _MULTIMODAL_MODELS = {
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Phi4MMForCausalLM"
:
(
"phi4mm"
,
"Phi4MMForCausalLM"
),
"PixtralForConditionalGeneration"
:
(
"pixtral"
,
"PixtralForConditionalGeneration"
),
# noqa: E501
"QwenVLForConditionalGeneration"
:
(
"qwen_vl"
,
"QwenVLForConditionalGeneration"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"PixtralForConditionalGeneration"
:
(
"pixtral"
,
"PixtralForConditionalGeneration"
),
"QwenVLForConditionalGeneration"
:
(
"qwen_vl"
,
"QwenVLForConditionalGeneration"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
"Qwen2_5_VLForConditionalGeneration"
:
(
"qwen2_5_vl"
,
"Qwen2_5_VLForConditionalGeneration"
,
...
...
@@ -506,39 +500,40 @@ _MULTIMODAL_MODELS = {
"qwen3_asr"
,
"Qwen3ASRForConditionalGeneration"
,
),
"Qwen3ASRRealtimeGeneration"
:
(
"qwen3_asr_realtime"
,
"Qwen3ASRRealtimeGeneration"
,
),
"Qwen3VLForConditionalGeneration"
:
(
"qwen3_vl"
,
"Qwen3VLForConditionalGeneration"
),
# noqa: E501
"Qwen3ASRRealtimeGeneration"
:
(
"qwen3_asr_realtime"
,
"Qwen3ASRRealtimeGeneration"
),
"Qwen3VLForConditionalGeneration"
:
(
"qwen3_vl"
,
"Qwen3VLForConditionalGeneration"
),
"Qwen3VLMoeForConditionalGeneration"
:
(
"qwen3_vl_moe"
,
"Qwen3VLMoeForConditionalGeneration"
,
),
"Qwen3_5ForConditionalGeneration"
:
(
"qwen3_5"
,
"Qwen3_5ForConditionalGeneration"
,
),
"Qwen3_5ForConditionalGeneration"
:
(
"qwen3_5"
,
"Qwen3_5ForConditionalGeneration"
),
"Qwen3_5MoeForConditionalGeneration"
:
(
"qwen3_5"
,
"Qwen3_5MoeForConditionalGeneration"
,
),
"RForConditionalGeneration"
:
(
"rvl"
,
"RForConditionalGeneration"
),
"SkyworkR1VChatModel"
:
(
"skyworkr1v"
,
"SkyworkR1VChatModel"
),
"Step3VLForConditionalGeneration"
:
(
"step3_vl"
,
"Step3VLForConditionalGeneration"
),
# noqa: E501
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
# noqa: E501
"SmolVLMForConditionalGeneration"
:
(
"smolvlm"
,
"SmolVLMForConditionalGeneration"
),
"StepVLForConditionalGeneration"
:
(
"step_vl"
,
"StepVLForConditionalGeneration"
),
"Step3VLForConditionalGeneration"
:
(
"step3_vl"
,
"Step3VLForConditionalGeneration"
),
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
"Tarsier2ForConditionalGeneration"
:
(
"qwen2_vl"
,
"Tarsier2ForConditionalGeneration"
,
),
"UltravoxModel"
:
(
"ultravox"
,
"UltravoxModel"
),
"VoxtralForConditionalGeneration"
:
(
"voxtral"
,
"VoxtralForConditionalGeneration"
),
# noqa: E501
"VoxtralRealtimeGeneration"
:
(
"voxtral_realtime"
,
"VoxtralRealtimeGeneration"
),
# noqa: E501
"VoxtralForConditionalGeneration"
:
(
"voxtral"
,
"VoxtralForConditionalGeneration"
),
"VoxtralRealtimeGeneration"
:
(
"voxtral_realtime"
,
"VoxtralRealtimeGeneration"
),
# [Encoder-decoder]
"CohereASRForConditionalGeneration"
:
(
"cohere_asr"
,
"CohereASRForConditionalGeneration"
,
),
"NemotronParseForConditionalGeneration"
:
(
"nemotron_parse"
,
"NemotronParseForConditionalGeneration"
,
),
"WhisperForConditionalGeneration"
:
(
"whisper"
,
"WhisperForConditionalGeneration"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
(
"whisper"
,
"WhisperForConditionalGeneration"
),
}
_SPECULATIVE_DECODING_MODELS
=
{
...
...
@@ -648,14 +643,17 @@ _PREVIOUSLY_SUPPORTED_MODELS = {
"Phi4MultimodalForCausalLM"
:
"0.12.0"
,
# encoder-decoder models except whisper
# have been removed for V0 deprecation.
"BartModel"
:
"0.10.2"
,
"BartForConditionalGeneration"
:
"0.10.2"
,
"DonutForConditionalGeneration"
:
"0.10.2"
,
"Florence2ForConditionalGeneration"
:
"0.10.2"
,
"MBartForConditionalGeneration"
:
"0.10.2"
,
"MllamaForConditionalGeneration"
:
"0.10.2"
,
}
_OOT_SUPPORTED_MODELS
=
{
"BartModel"
:
"https://github.com/vllm-project/bart-plugin"
,
"BartForConditionalGeneration"
:
"https://github.com/vllm-project/bart-plugin"
,
"Florence2ForConditionalGeneration"
:
"https://github.com/vllm-project/bart-plugin"
,
"MBartForConditionalGeneration"
:
"https://github.com/vllm-project/bart-plugin"
,
}
@
dataclass
(
frozen
=
True
)
class
_ModelInfo
:
...
...
@@ -952,6 +950,14 @@ class _ModelRegistry:
"Please use an older version of vLLM if you want to "
"use this model architecture."
)
if
arch
in
_OOT_SUPPORTED_MODELS
:
plugin_url
=
_OOT_SUPPORTED_MODELS
[
arch
]
raise
ValueError
(
f
"Model architecture
{
arch
}
is not supported in-tree anymore. "
f
"Please install the plugin at
{
plugin_url
}
if you want to "
"use this model architecture."
)
raise
ValueError
(
f
"Model architectures
{
architectures
}
are not supported for now. "
...
...
vllm/model_executor/models/roberta.py
View file @
0da93439
...
...
@@ -10,6 +10,7 @@ from transformers import RobertaConfig
from
vllm.config
import
ModelConfig
,
PoolerConfig
,
VllmConfig
from
vllm.model_executor.layers.pooler
import
(
BgeM3Pooler
,
BOSEOSFilter
,
DispatchPooler
,
Pooler
,
...
...
@@ -216,24 +217,29 @@ class BgeM3EmbeddingModel(RobertaEmbeddingModel):
self
.
colbert_linear
=
nn
.
Linear
(
self
.
hidden_size
,
self
.
hidden_size
,
dtype
=
self
.
head_dtype
)
embed_pooler
=
pooler_for_embed
(
pooler_config
)
token_classify_pooler
=
BOSEOSFilter
(
pooler_for_token_classify
(
pooler_config
,
pooling
=
AllPool
(),
classifier
=
self
.
sparse_linear
,
act_fn
=
torch
.
relu
,
),
self
.
bos_token_id
,
self
.
eos_token_id
,
)
return
DispatchPooler
(
{
"embed"
:
pooler_for_
embed
(
pooler
_config
)
,
"embed"
:
embed
_
pooler
,
"token_embed"
:
BOSEOSFilter
(
pooler_for_token_embed
(
pooler_config
,
self
.
colbert_linear
),
self
.
bos_token_id
,
# for some reason m3 only filters the bos for colbert vectors
),
"token_classify"
:
BOSEOSFilter
(
pooler_for_token_classify
(
pooler_config
,
pooling
=
AllPool
(),
classifier
=
self
.
sparse_linear
,
act_fn
=
torch
.
relu
,
),
self
.
bos_token_id
,
self
.
eos_token_id
,
"token_classify"
:
token_classify_pooler
,
"embed&token_classify"
:
BgeM3Pooler
(
token_classify_pooler
,
embed_pooler
),
}
)
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
0da93439
...
...
@@ -7,14 +7,12 @@
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
from
typing
import
Annotated
,
Literal
,
TypeAlias
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
PretrainedConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
...
...
@@ -26,40 +24,23 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
)
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
,
)
from
vllm.multimodal.processing
import
(
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
)
from
vllm.multimodal.inputs
import
MultiModalDataDict
from
vllm.multimodal.processing
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processors.internvl
import
(
InternVLImageProcessor
,
InternVLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.internvl
import
(
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
)
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
class
SkyworkR1VImagePixelInputs
(
TensorSchema
):
"""
...
...
@@ -106,418 +87,36 @@ SkyworkR1VImageInputs: TypeAlias = (
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def
build_transform
(
input_size
:
int
):
MEAN
,
STD
=
IMAGENET_MEAN
,
IMAGENET_STD
return
T
.
Compose
(
[
T
.
Lambda
(
lambda
img
:
convert_image_mode
(
img
,
"RGB"
)),
T
.
Resize
(
(
input_size
,
input_size
),
interpolation
=
T
.
InterpolationMode
.
BICUBIC
),
T
.
ToTensor
(),
T
.
Normalize
(
mean
=
MEAN
,
std
=
STD
),
]
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def
find_closest_aspect_ratio
(
aspect_ratio
:
float
,
target_ratios
:
list
[
tuple
[
int
,
int
]],
*
,
width
:
int
,
height
:
int
,
image_size
:
int
,
)
->
tuple
[
int
,
int
]:
best_ratio_diff
=
float
(
"inf"
)
best_ratio
=
(
1
,
1
)
area
=
width
*
height
for
ratio
in
target_ratios
:
target_aspect_ratio
=
ratio
[
0
]
/
ratio
[
1
]
ratio_diff
=
abs
(
aspect_ratio
-
target_aspect_ratio
)
if
ratio_diff
<
best_ratio_diff
:
best_ratio_diff
=
ratio_diff
best_ratio
=
ratio
elif
ratio_diff
==
best_ratio_diff
:
if
area
>
0.5
*
image_size
*
image_size
*
ratio
[
0
]
*
ratio
[
1
]:
best_ratio
=
ratio
return
best_ratio
def
resolve_skyworkr1v_min_max_num
(
*
,
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
bool
,
use_thumbnail
:
bool
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_dynamic_patch
=
max_dynamic_patch
if
dynamic_image_size
else
1
if
use_thumbnail
and
max_dynamic_patch
!=
1
:
max_dynamic_patch
+=
1
return
min_dynamic_patch
,
max_dynamic_patch
def
get_skyworkr1v_target_ratios
(
min_num
:
int
,
max_num
:
int
,
)
->
list
[
tuple
[
int
,
int
]]:
target_ratios
=
{
(
i
,
j
)
for
n
in
range
(
min_num
,
max_num
+
1
)
for
i
in
range
(
1
,
n
+
1
)
for
j
in
range
(
1
,
n
+
1
)
if
min_num
<=
i
*
j
<=
max_num
}
return
sorted
(
target_ratios
,
key
=
lambda
x
:
x
[
0
]
*
x
[
1
])
def
calculate_skyworkr1v_targets
(
*
,
orig_width
:
int
,
orig_height
:
int
,
target_ratios
:
list
[
tuple
[
int
,
int
]],
image_size
:
int
,
use_thumbnail
:
bool
,
)
->
tuple
[
int
,
int
,
int
]:
aspect_ratio
=
orig_width
/
orig_height
# find the closest aspect ratio to the target
target_aspect_ratio
=
find_closest_aspect_ratio
(
aspect_ratio
,
target_ratios
,
width
=
orig_width
,
height
=
orig_height
,
image_size
=
image_size
,
)
# calculate the target width and height
target_width
=
image_size
*
target_aspect_ratio
[
0
]
target_height
=
image_size
*
target_aspect_ratio
[
1
]
blocks
=
target_aspect_ratio
[
0
]
*
target_aspect_ratio
[
1
]
# add thumbnail image if num_blocks != 1
if
use_thumbnail
and
blocks
!=
1
:
blocks
+=
1
return
blocks
,
target_width
,
target_height
def
dynamic_preprocess_skyworkr1v
(
image
:
Image
.
Image
,
*
,
target_ratios
:
list
[
tuple
[
int
,
int
]],
image_size
:
int
,
use_thumbnail
:
bool
,
)
->
list
[
Image
.
Image
]:
orig_width
,
orig_height
=
image
.
size
# calculate the number of blocks without thumbnail
blocks
,
target_width
,
target_height
=
calculate_skyworkr1v_targets
(
orig_width
=
orig_width
,
orig_height
=
orig_height
,
target_ratios
=
target_ratios
,
image_size
=
image_size
,
use_thumbnail
=
False
,
)
# resize the image
resized_img
=
image
.
resize
((
target_width
,
target_height
))
processed_images
=
[]
for
i
in
range
(
blocks
):
box
=
(
(
i
%
(
target_width
//
image_size
))
*
image_size
,
(
i
//
(
target_width
//
image_size
))
*
image_size
,
((
i
%
(
target_width
//
image_size
))
+
1
)
*
image_size
,
((
i
//
(
target_width
//
image_size
))
+
1
)
*
image_size
,
)
# split the image
split_img
=
resized_img
.
crop
(
box
)
processed_images
.
append
(
split_img
)
assert
len
(
processed_images
)
==
blocks
if
use_thumbnail
and
len
(
processed_images
)
!=
1
:
thumbnail_img
=
image
.
resize
((
image_size
,
image_size
))
processed_images
.
append
(
thumbnail_img
)
return
processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def
image_to_pixel_values_skyworkr1v
(
image
:
Image
.
Image
,
*
,
input_size
:
int
,
min_num
:
int
,
max_num
:
int
,
use_thumbnail
:
bool
,
)
->
torch
.
Tensor
:
target_ratios
=
get_skyworkr1v_target_ratios
(
min_num
,
max_num
)
transform
=
build_transform
(
input_size
=
input_size
)
images
=
dynamic_preprocess_skyworkr1v
(
image
,
target_ratios
=
target_ratios
,
image_size
=
input_size
,
use_thumbnail
=
use_thumbnail
,
)
pixel_values
=
torch
.
stack
([
transform
(
image
)
for
image
in
images
])
return
pixel_values
class
SkyworkR1VProcessor
:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
"""
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
None
:
super
().
__init__
()
self
.
config
=
config
self
.
tokenizer
=
tokenizer
image_size
:
int
=
config
.
vision_config
.
image_size
patch_size
:
int
=
config
.
vision_config
.
patch_size
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
config
.
min_dynamic_patch
assert
isinstance
(
min_dynamic_patch
,
int
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
config
.
max_dynamic_patch
assert
isinstance
(
max_dynamic_patch
,
int
)
if
dynamic_image_size
is
None
:
dynamic_image_size
=
config
.
dynamic_image_size
assert
isinstance
(
dynamic_image_size
,
bool
)
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
config
.
downsample_ratio
**
2
)
)
self
.
image_size
=
image_size
self
.
min_dynamic_patch
=
min_dynamic_patch
self
.
max_dynamic_patch
=
max_dynamic_patch
self
.
dynamic_image_size
=
dynamic_image_size
self
.
use_thumbnail
:
bool
=
config
.
use_thumbnail
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
IMG_CONTEXT
)
def
resolve_min_max_num
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
is
None
else
dynamic_image_size
)
use_thumbnail
=
self
.
use_thumbnail
if
use_thumbnail
is
None
else
use_thumbnail
return
resolve_skyworkr1v_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
return
get_skyworkr1v_target_ratios
(
min_num
,
max_num
)
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
=
calculate_skyworkr1v_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
return
[
image_to_pixel_values_skyworkr1v
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
)
for
image
in
images
]
def
__call__
(
self
,
text
:
str
|
list
[
str
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
]
|
None
=
None
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
return_tensors
:
str
|
TensorType
|
None
=
None
,
)
->
BatchFeature
:
if
text
is
None
:
text
=
[]
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
if
images
is
None
:
images
=
[]
if
not
isinstance
(
images
,
list
):
images
=
[
images
]
if
len
(
images
)
==
0
:
image_inputs
=
{}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst
]
),
}
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
class
SkyworkR1VProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
text
=
[
t
.
replace
(
"<image>"
,
image_repl
.
full
,
1
)
for
t
in
text
]
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
text_inputs
=
self
.
tokenizer
(
text
)
return
InternVLImageProcessor
(
**
kwargs
)
combined_outputs
=
{
**
text_inputs
,
**
image_inputs
}
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
InternVLProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
return
BatchFeature
(
combined_outputs
,
tensor_type
=
return_tensors
)
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
class
SkyworkR1VProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
SkyworkR1VProcessor
:
return
self
.
ctx
.
init_processor
(
SkyworkR1VProcessor
,
config
=
self
.
get_hf_config
(),
return
InternVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
}
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
processor
:
SkyworkR1VProcessor
,
)
->
int
:
return
processor
.
get_num_image_tokens
(
image_width
=
image_width
,
image_height
=
image_height
,
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
processor
=
self
.
get_hf_processor
()
base_size
=
processor
.
image_size
target_ratios
=
processor
.
resolve_target_ratios
()
largest_feature_size
,
largest_feature_pinpoint
=
0
,
None
for
wr
,
hr
in
target_ratios
:
width
,
height
=
base_size
*
wr
,
base_size
*
hr
feat_size
=
self
.
get_num_image_tokens
(
image_width
=
width
,
image_height
=
height
,
processor
=
processor
,
)
if
feat_size
>
largest_feature_size
:
largest_feature_size
=
feat_size
largest_feature_pinpoint
=
ImageSize
(
width
=
width
,
height
=
height
)
if
largest_feature_size
==
0
or
largest_feature_pinpoint
is
None
:
raise
ValueError
(
"Cannot have a largest feature size of 0!"
)
return
largest_feature_pinpoint
class
SkyworkR1VDummyInputsBuilder
(
BaseDummyInputsBuilder
[
SkyworkR1VProcessingInfo
]):
def
get_dummy_text
(
self
,
mm_counts
:
Mapping
[
str
,
int
])
->
str
:
...
...
@@ -546,102 +145,10 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
}
class
SkyworkR1VMultiModalProcessor
(
BaseMultiModalProcessor
[
SkyworkR1VProcessingInfo
]):
def
_call_hf_processor
(
self
,
prompt
:
str
,
mm_data
:
Mapping
[
str
,
object
],
mm_kwargs
:
Mapping
[
str
,
object
],
tok_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
processed_outputs
=
super
().
_call_hf_processor
(
prompt
=
prompt
,
mm_data
=
mm_data
,
mm_kwargs
=
mm_kwargs
,
tok_kwargs
=
tok_kwargs
,
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
image_token_id
=
hf_processor
.
image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
# tokens to merge from the vision encoder outputs
processed_outputs
[
"image_token_id"
]
=
torch
.
tensor
(
image_token_id
)
return
processed_outputs
def
_get_mm_fields_config
(
self
,
hf_inputs
:
BatchFeature
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
image_num_patches
=
hf_inputs
.
get
(
"image_num_patches"
,
torch
.
empty
(
0
))
num_images
=
len
(
image_num_patches
)
return
dict
(
pixel_values_flat
=
MultiModalFieldConfig
.
flat_from_sizes
(
"image"
,
image_num_patches
),
image_num_patches
=
MultiModalFieldConfig
.
batched
(
"image"
),
image_embeds
=
MultiModalFieldConfig
.
batched
(
"image"
),
image_token_id
=
MultiModalFieldConfig
.
shared
(
"image"
,
num_images
),
)
def
_get_prompt_updates
(
self
,
mm_items
:
MultiModalDataItems
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
out_mm_kwargs
:
MultiModalKwargsItems
,
)
->
Sequence
[
PromptUpdate
]:
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
out_mm_data
=
out_mm_kwargs
.
get_data
()
if
"image_num_patches"
in
out_mm_data
:
image_num_patches
=
out_mm_data
[
"image_num_patches"
]
assert
isinstance
(
image_num_patches
,
torch
.
Tensor
)
image_num_patches
=
image_num_patches
.
tolist
()
elif
"image_embeds"
in
out_mm_data
:
# TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL)
image_num_patches
=
[
None
]
*
len
(
out_mm_data
[
"image_embeds"
])
else
:
image_num_patches
=
[]
def
get_replacement_skyworkr1v
(
item_idx
:
int
):
images
=
mm_items
.
get_items
(
"image"
,
(
ImageEmbeddingItems
,
ImageProcessorItems
)
)
if
isinstance
(
images
,
ImageEmbeddingItems
):
feature_size
=
images
.
get_feature_size
(
item_idx
)
else
:
image_size
=
images
.
get_image_size
(
item_idx
)
feature_size
=
self
.
info
.
get_num_image_tokens
(
image_width
=
image_size
.
width
,
image_height
=
image_size
.
height
,
processor
=
hf_processor
,
)
num_patches
=
image_num_patches
[
item_idx
]
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
[
PromptReplacement
(
modality
=
"image"
,
target
=
"<image>"
,
replacement
=
get_replacement_skyworkr1v
,
)
]
@
MULTIMODAL_REGISTRY
.
register_processor
(
SkyworkR1V
MultiModalProcessor
,
BaseInternVL
MultiModalProcessor
,
info
=
SkyworkR1VProcessingInfo
,
dummy_inputs
=
SkyworkR1V
DummyInputsBuilder
,
dummy_inputs
=
BaseInternVL
DummyInputsBuilder
,
)
class
SkyworkR1VChatModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
@
classmethod
...
...
vllm/model_executor/models/step3_vl.py
View file @
0da93439
...
...
@@ -2,18 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
itertools
import
product
from
math
import
ceil
,
sqrt
from
math
import
sqrt
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
PIL
import
Image
from
torchvision
import
transforms
from
torchvision.transforms.functional
import
InterpolationMode
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
...
...
@@ -43,8 +38,12 @@ from vllm.multimodal.processing import (
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.configs
import
Step3VisionEncoderConfig
from
vllm.transformers_utils.configs.step3_vl
import
Step3VisionEncoderConfig
from
vllm.transformers_utils.processors.step3_vl
import
(
MAX_IMAGE_SIZE
,
Step3VLImageProcessor
,
Step3VLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
...
...
@@ -89,447 +88,32 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
Step3VLImageInputs
:
TypeAlias
=
Step3VLImagePixelInputs
|
Step3VLImageEmbeddingInputs
ImageWithPatches
=
tuple
[
Image
.
Image
,
list
[
Image
.
Image
],
list
[
bool
]
|
None
]
MAX_IMAGE_SIZE
:
int
=
3024
class
Step3VisionProcessor
:
def
__init__
(
self
,
size
,
interpolation_mode
=
"bicubic"
,
patch_size
=
None
):
mean
=
[
0.48145466
,
0.4578275
,
0.40821073
]
std
=
[
0.26862954
,
0.26130258
,
0.27577711
]
patch_size
=
patch_size
if
patch_size
is
not
None
else
size
self
.
transform
=
transforms
.
Compose
(
[
transforms
.
ToTensor
(),
transforms
.
Normalize
(
mean
,
std
),
transforms
.
Resize
(
(
size
,
size
),
interpolation
=
InterpolationMode
.
BICUBIC
if
interpolation_mode
==
"bicubic"
else
InterpolationMode
.
BILINEAR
,
antialias
=
True
,
),
]
)
self
.
patch_transform
=
(
transforms
.
Compose
(
[
transforms
.
ToTensor
(),
transforms
.
Normalize
(
mean
,
std
),
transforms
.
Resize
(
(
patch_size
,
patch_size
),
interpolation
=
InterpolationMode
.
BICUBIC
if
interpolation_mode
==
"bicubic"
else
InterpolationMode
.
BILINEAR
,
antialias
=
True
,
),
]
)
if
patch_size
is
not
None
else
None
)
def
__call__
(
self
,
image
,
is_patch
=
False
):
if
is_patch
:
return
{
"pixel_values"
:
self
.
patch_transform
(
image
).
unsqueeze
(
0
)}
else
:
return
{
"pixel_values"
:
self
.
transform
(
image
).
unsqueeze
(
0
)}
class
ImagePatcher
:
def
__init__
(
self
,
enable_patch
:
bool
=
True
)
->
None
:
self
.
enable_patch
=
enable_patch
def
determine_window_size
(
self
,
long
:
int
,
short
:
int
)
->
int
:
if
long
<
728
:
return
short
if
long
/
short
>
1.5
else
0
return
min
(
short
,
504
)
if
long
/
short
>
4
else
504
def
slide_window
(
self
,
width
:
int
,
height
:
int
,
sizes
:
list
[
tuple
[
int
,
int
]],
steps
:
list
[
tuple
[
int
,
int
]],
img_rate_thr
:
float
=
0.6
,
)
->
tuple
[
list
[
tuple
[
int
,
int
,
int
,
int
]],
tuple
[
int
,
int
]]:
assert
1
>=
img_rate_thr
>=
0
,
"The `in_rate_thr` should lie in 0~1"
windows
=
[]
# Sliding windows.
for
size
,
step
in
zip
(
sizes
,
steps
):
size_w
,
size_h
=
size
step_w
,
step_h
=
step
x_num
=
1
if
width
<=
size_w
else
ceil
((
width
-
size_w
)
/
step_w
+
1
)
x_start
=
[
step_w
*
i
for
i
in
range
(
x_num
)]
if
len
(
x_start
)
>
1
and
x_start
[
-
1
]
+
size_w
>
width
:
x_start
[
-
1
]
=
width
-
size_w
y_num
=
1
if
height
<=
size_h
else
ceil
((
height
-
size_h
)
/
step_h
+
1
)
y_start
=
[
step_h
*
i
for
i
in
range
(
y_num
)]
if
len
(
y_start
)
>
1
and
y_start
[
-
1
]
+
size_h
>
height
:
y_start
[
-
1
]
=
height
-
size_h
start
=
np
.
array
(
list
(
product
(
y_start
,
x_start
)),
dtype
=
int
)
start
[:,
[
0
,
1
]]
=
start
[:,
[
1
,
0
]]
windows
.
append
(
np
.
concatenate
([
start
,
start
+
size
],
axis
=
1
))
windows
=
np
.
concatenate
(
windows
,
axis
=
0
)
return
[
(
int
(
box
[
0
]),
int
(
box
[
1
]),
int
(
box
[
2
]
-
box
[
0
]),
int
(
box
[
3
]
-
box
[
1
]))
for
box
in
windows
],
(
x_num
,
y_num
)
def
square_pad
(
self
,
img
:
Image
.
Image
)
->
Image
.
Image
:
w
,
h
=
img
.
size
if
w
==
h
:
return
img
size
=
max
(
w
,
h
)
padded
=
Image
.
new
(
img
.
mode
,
(
size
,
size
),
0
)
padded
.
paste
(
img
,
(
0
,
0
))
return
padded
def
get_image_size_for_padding
(
self
,
img_width
:
int
,
img_height
:
int
)
->
tuple
[
int
,
int
]:
ratio
=
img_width
/
img_height
if
min
(
img_height
,
img_width
)
<
32
and
(
ratio
>
4
or
ratio
<
1
/
4
):
new_size
=
max
(
img_height
,
img_width
)
return
new_size
,
new_size
return
img_width
,
img_height
def
get_image_size_for_preprocess
(
self
,
img_width
:
int
,
img_height
:
int
)
->
tuple
[
int
,
int
]:
if
max
(
img_height
,
img_width
)
>
MAX_IMAGE_SIZE
:
scale_factor
=
MAX_IMAGE_SIZE
/
max
(
img_height
,
img_width
)
img_width
=
int
(
img_width
*
scale_factor
)
img_height
=
int
(
img_height
*
scale_factor
)
return
img_width
,
img_height
def
get_image_size_for_crop
(
self
,
img_width
:
int
,
img_height
:
int
,
window_size
:
int
):
w_ratio
=
img_width
/
window_size
h_ratio
=
img_height
/
window_size
if
w_ratio
<
1
:
width_new
=
img_width
else
:
decimal_w
=
w_ratio
-
img_width
//
window_size
w_ratio
=
int
(
w_ratio
)
+
1
if
decimal_w
>
0.2
else
int
(
w_ratio
)
width_new
=
window_size
*
w_ratio
if
h_ratio
<
1
:
height_new
=
img_height
else
:
decimal_h
=
h_ratio
-
img_height
//
window_size
h_ratio
=
int
(
h_ratio
)
+
1
if
decimal_h
>
0.2
else
int
(
h_ratio
)
height_new
=
window_size
*
h_ratio
return
int
(
width_new
),
int
(
height_new
)
def
patch_crop
(
self
,
img
:
Image
.
Image
,
i
:
int
,
j
:
int
,
th
:
int
,
tw
:
int
):
target
=
img
.
crop
((
j
,
i
,
j
+
tw
,
i
+
th
))
return
target
def
get_num_patches
(
self
,
img_width
:
int
,
img_height
:
int
)
->
tuple
[
int
,
int
]:
img_width
,
img_height
=
self
.
get_image_size_for_padding
(
img_width
,
img_height
)
img_width
,
img_height
=
self
.
get_image_size_for_preprocess
(
img_width
,
img_height
)
window_size
=
self
.
determine_window_size
(
max
(
img_height
,
img_width
),
min
(
img_height
,
img_width
)
)
if
window_size
==
0
or
not
self
.
enable_patch
:
return
0
,
0
else
:
img_width
,
img_height
=
self
.
get_image_size_for_crop
(
img_width
,
img_height
,
window_size
)
center_list
,
(
x_num
,
y_num
)
=
self
.
slide_window
(
img_width
,
img_height
,
[(
window_size
,
window_size
)],
[(
window_size
,
window_size
)],
)
full_rows
=
(
len
(
center_list
)
-
1
)
//
x_num
+
1
if
len
(
center_list
)
>
0
and
len
(
center_list
)
%
x_num
==
0
:
full_rows
-=
1
return
len
(
center_list
),
full_rows
def
__call__
(
self
,
img
:
Image
.
Image
)
->
tuple
[
Image
.
Image
,
list
[
Image
.
Image
],
list
[
bool
]
|
None
]:
img_width
,
img_height
=
img
.
size
new_img_width
,
new_img_height
=
self
.
get_image_size_for_padding
(
img_width
,
img_height
)
if
new_img_width
!=
img_width
or
new_img_height
!=
img_height
:
img
=
self
.
square_pad
(
img
)
img_width
,
img_height
=
img
.
size
new_img_width
,
new_img_height
=
self
.
get_image_size_for_preprocess
(
img_width
,
img_height
)
img
=
img
.
resize
((
new_img_width
,
new_img_height
),
Image
.
Resampling
.
BILINEAR
)
window_size
=
self
.
determine_window_size
(
max
(
new_img_height
,
new_img_width
),
min
(
new_img_height
,
new_img_width
)
)
if
window_size
==
0
or
not
self
.
enable_patch
:
return
img
,
[],
None
else
:
new_img_width
,
new_img_height
=
self
.
get_image_size_for_crop
(
new_img_width
,
new_img_height
,
window_size
)
if
(
new_img_width
,
new_img_height
)
!=
(
img_width
,
img_height
):
img_for_crop
=
img
.
resize
(
(
new_img_width
,
new_img_height
),
Image
.
Resampling
.
BILINEAR
)
else
:
img_for_crop
=
img
patches
=
[]
newlines
=
[]
center_list
,
(
x_num
,
y_num
)
=
self
.
slide_window
(
new_img_width
,
new_img_height
,
[(
window_size
,
window_size
)],
[(
window_size
,
window_size
)],
)
for
patch_id
,
center_lf_point
in
enumerate
(
center_list
):
x
,
y
,
patch_w
,
patch_h
=
center_lf_point
big_patch
=
self
.
patch_crop
(
img_for_crop
,
y
,
x
,
patch_h
,
patch_w
)
patches
.
append
(
big_patch
)
if
(
patch_id
+
1
)
%
x_num
==
0
:
newlines
.
append
(
patch_id
)
if
newlines
and
newlines
[
-
1
]
==
len
(
patches
)
-
1
:
newlines
.
pop
()
return
(
img
,
patches
,
[
i
in
newlines
for
i
in
range
(
len
(
patches
))]
if
len
(
patches
)
>
0
else
None
,
)
class
Step3VLProcessor
:
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
)
->
None
:
super
().
__init__
()
self
.
config
=
config
self
.
tokenizer
=
tokenizer
self
.
image_size
=
728
self
.
patch_size
=
504
self
.
image_preprocessor
=
Step3VisionProcessor
(
self
.
image_size
,
"bilinear"
,
self
.
patch_size
)
self
.
num_image_feature_size
=
169
self
.
num_patch_feature_size
=
81
self
.
image_token
=
"<im_patch>"
self
.
image_feature_placeholder
=
self
.
image_token
*
self
.
num_image_feature_size
self
.
patch_feature_placeholder
=
self
.
image_token
*
self
.
num_patch_feature_size
# Respect vision config switch to enable/disable patch extraction.
# For video understanding, it's preferable to disable patch.
enable_patch
=
getattr
(
self
.
config
.
vision_config
,
"enable_patch"
,
True
)
self
.
patcher
=
ImagePatcher
(
enable_patch
=
enable_patch
)
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
self
.
image_token
]
def
get_num_image_tokens
(
self
,
img_width
:
int
,
img_height
:
int
)
->
int
:
num_patches
,
num_newlines
=
self
.
patcher
.
get_num_patches
(
img_width
,
img_height
)
return
(
num_patches
*
(
self
.
num_patch_feature_size
+
2
)
+
self
.
num_image_feature_size
+
2
+
num_newlines
)
def
_split_images
(
self
,
images
:
list
[
Image
.
Image
])
->
list
[
ImageWithPatches
]:
result
=
[]
for
img
in
images
:
result
.
append
(
self
.
patcher
(
img
))
return
result
def
_convert_images_to_pixel_values
(
self
,
images
:
list
[
Image
.
Image
],
is_patch
:
bool
=
False
,
)
->
list
[
torch
.
Tensor
]:
return
[
self
.
image_preprocessor
(
img
,
is_patch
=
is_patch
)[
"pixel_values"
]
for
img
in
images
]
def
_get_patch_repl
(
self
,
num_patches
:
int
,
patch_newline_mask
:
list
[
bool
]
|
None
,
)
->
tuple
[
str
,
list
[
int
]]:
text
=
""
token_ids
=
[]
for
i
in
range
(
num_patches
):
assert
len
(
patch_newline_mask
)
==
num_patches
text
+=
f
"<patch_start>
{
self
.
patch_feature_placeholder
}
<patch_end>"
token_ids
.
extend
(
[
self
.
tokenizer
.
convert_tokens_to_ids
(
"<patch_start>"
)]
+
[
self
.
image_token_id
]
*
self
.
num_patch_feature_size
+
[
self
.
tokenizer
.
convert_tokens_to_ids
(
"<patch_end>"
)]
)
if
patch_newline_mask
and
patch_newline_mask
[
i
]:
text
+=
"<patch_newline>"
token_ids
.
append
(
self
.
tokenizer
.
convert_tokens_to_ids
(
"<patch_newline>"
)
)
return
text
,
token_ids
def
_get_image_repl
(
self
,
num_images
:
int
,
)
->
tuple
[
str
,
list
[
int
]]:
text
=
f
"<im_start>
{
self
.
image_feature_placeholder
}
<im_end>"
token_ids
=
(
[
self
.
tokenizer
.
convert_tokens_to_ids
(
"<im_start>"
)]
+
[
self
.
image_token_id
]
*
self
.
num_image_feature_size
+
[
self
.
tokenizer
.
convert_tokens_to_ids
(
"<im_end>"
)]
)
return
text
*
num_images
,
token_ids
*
num_images
def
_get_image_repl_features
(
self
,
num_images
:
int
,
num_patches
:
int
,
patch_new_line_idx
:
list
[
bool
]
|
None
,
)
->
tuple
[
str
,
list
[
int
]]:
if
num_patches
>
0
:
patch_repl
,
patch_repl_ids
=
self
.
_get_patch_repl
(
num_patches
,
patch_new_line_idx
)
else
:
patch_repl
=
""
patch_repl_ids
=
[]
image_repl
,
image_repl_ids
=
self
.
_get_image_repl
(
num_images
)
return
patch_repl
+
image_repl
,
patch_repl_ids
+
image_repl_ids
def
replace_placeholder
(
self
,
text
:
str
,
placeholder
:
str
,
repls
:
list
[
str
])
->
str
:
parts
=
text
.
split
(
placeholder
)
if
len
(
parts
)
-
1
!=
len
(
repls
):
raise
ValueError
(
"The number of placeholders does not match the number of replacements."
)
result
=
[
parts
[
0
]]
for
i
,
repl
in
enumerate
(
repls
):
result
.
append
(
repl
)
result
.
append
(
parts
[
i
+
1
])
return
""
.
join
(
result
)
class
Step3VLProcessingInfo
(
BaseProcessingInfo
):
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
def
__call__
(
self
,
text
:
str
|
list
[
str
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
]
|
None
=
None
,
return_tensors
:
str
|
TensorType
|
None
=
None
,
)
->
BatchFeature
:
if
text
is
None
:
text
=
[]
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
if
images
is
None
:
images
=
[]
if
not
isinstance
(
images
,
list
):
images
=
[
images
]
if
len
(
images
)
==
0
:
image_inputs
=
{}
text_inputs
=
self
.
tokenizer
(
text
)
else
:
split_images_data
=
self
.
_split_images
(
images
)
pixel_values_lst
=
[]
patch_pixel_values_lst
=
[]
patch_newline_mask_lst
=
[]
image_repl_str_lst
=
[]
image_repl_ids_lst
=
[]
num_patches
=
[]
for
raw_img
,
img_patches
,
patch_newline_mask
in
split_images_data
:
pixel_values_lst
.
extend
(
self
.
_convert_images_to_pixel_values
([
raw_img
]))
if
len
(
img_patches
)
>
0
:
patch_pixel_values_lst
.
extend
(
self
.
_convert_images_to_pixel_values
(
img_patches
,
is_patch
=
True
)
)
num_patches
.
append
(
len
(
img_patches
))
image_repl_str
,
image_repl_ids
=
self
.
_get_image_repl_features
(
1
,
len
(
img_patches
),
patch_newline_mask
)
image_repl_str_lst
.
append
(
image_repl_str
)
image_repl_ids_lst
.
extend
(
image_repl_ids
)
if
patch_newline_mask
is
not
None
:
patch_newline_mask_lst
.
extend
(
patch_newline_mask
)
pixel_values
=
torch
.
cat
(
pixel_values_lst
)
patch_size
=
self
.
patch_size
image_inputs
=
{
"pixel_values"
:
pixel_values
,
"num_patches"
:
num_patches
,
"patch_pixel_values"
:
(
torch
.
cat
(
patch_pixel_values_lst
)
if
patch_pixel_values_lst
else
pixel_values
.
new_empty
((
0
,
3
,
patch_size
,
patch_size
))
),
"patch_newline_mask"
:
torch
.
tensor
(
patch_newline_mask_lst
,
dtype
=
torch
.
bool
),
}
text
=
[
self
.
replace_placeholder
(
t
,
self
.
image_token
,
image_repl_str_lst
)
for
t
in
text
]
text_inputs
=
self
.
tokenizer
(
text
)
return
BatchFeature
(
{
**
text_inputs
,
**
image_inputs
,
},
tensor_type
=
return_tensors
,
kwargs
.
setdefault
(
"enable_patch"
,
getattr
(
config
.
vision_config
,
"enable_patch"
,
True
),
)
return
Step3VLImageProcessor
(
**
kwargs
)
class
Step3VLProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
)
->
Step3VLProcessor
:
return
Step3VLProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenize
r
(),
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
self
.
get_image_processo
r
(),
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
}
def
get_max_image_tokens
(
self
)
->
int
:
hf_processor
=
self
.
get_hf_processor
()
return
hf_processor
.
get_num_image_tokens
(
self
.
get_image_size_with_most_features
().
width
,
self
.
get_image_size_with_most_features
().
height
,
)
image_processor
=
self
.
get_image_processor
()
target_width
,
target_height
=
self
.
get_image_size_with_most_features
()
return
image_processor
.
get_num_image_tokens
(
target_width
,
target_height
)
def
get_mm_max_tokens_per_item
(
self
,
...
...
@@ -539,20 +123,7 @@ class Step3VLProcessingInfo(BaseProcessingInfo):
return
{
"image"
:
self
.
get_max_image_tokens
()}
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
return
ImageSize
(
3024
,
3024
)
def
get_num_mm_tokens
(
self
,
mm_data
:
MultiModalDataDict
)
->
int
:
if
len
(
mm_data
)
!=
1
or
"image"
not
in
mm_data
:
raise
ValueError
(
"mm_data could only contain one key 'image' for steo1o"
)
image_data
=
mm_data
[
"image"
]
if
not
isinstance
(
image_data
,
(
list
,
tuple
)):
image_data
=
[
image_data
]
return
sum
(
self
.
get_hf_processor
().
get_num_image_tokens
(
img
.
width
,
img
.
height
)
for
img
in
image_data
)
return
ImageSize
(
MAX_IMAGE_SIZE
,
MAX_IMAGE_SIZE
)
class
Step3VLDummyInputsBuilder
(
BaseDummyInputsBuilder
[
Step3VLProcessingInfo
]):
...
...
@@ -594,13 +165,11 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo])
def
get_replacement_step1o
(
item_idx
:
int
):
out_item
=
out_mm_kwargs
[
"image"
][
item_idx
]
num_patches
=
int
(
out_item
[
"num_patches"
].
data
)
if
num_patches
>
0
:
patch_newline_mask
=
out_item
[
"patch_newline_mask"
].
data
image_repl_ids
=
hf_processor
.
_get_image_repl_features
(
1
,
num_patches
,
patch_newline_mask
.
tolist
()
)[
1
]
else
:
image_repl_ids
=
hf_processor
.
_get_image_repl_features
(
1
,
0
,
None
)[
1
]
patch_newline_mask
=
out_item
[
"patch_newline_mask"
].
data
image_repl_ids
=
hf_processor
.
get_image_repl_feature_ids
(
1
,
num_patches
,
patch_newline_mask
.
tolist
()
)
return
PromptUpdateDetails
.
select_token_id
(
seq
=
image_repl_ids
,
embed_token_id
=
image_placeholder_token_id
,
...
...
vllm/model_executor/models/tarsier.py
View file @
0da93439
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.llava
import
LlavaDummyInputsBuilder
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargsItems
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
...
...
@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
MultiModalDataItems
,
)
from
vllm.multimodal.processing
import
(
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
BaseProcessingInfo
,
InputProcessingContext
,
PromptReplacement
,
PromptUpdate
,
)
...
...
@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
]
def
_build_tarsier_hf_info
(
ctx
:
InputProcessingContext
)
->
TarsierProcessingInfo
:
return
TarsierProcessingInfo
(
ctx
)
def
_build_tarsier_hf_processor
(
info
:
_I_Tarsier
,
dummy_inputs
:
BaseDummyInputsBuilder
[
_I_Tarsier
],
*
,
cache
:
BaseMultiModalProcessorCache
|
None
=
None
,
)
->
BaseMultiModalProcessor
:
if
isinstance
(
info
,
TarsierProcessingInfo
):
return
TarsierMultiModalProcessor
(
info
,
dummy_inputs
,
cache
=
cache
,
)
raise
NotImplementedError
(
type
(
info
))
def
init_vision_tower_for_tarsier
(
hf_config
:
TarsierHfConfig
,
# Use the Tarsier specific config protocol
quant_config
:
QuantizationConfig
|
None
,
...
...
@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(
@
MULTIMODAL_REGISTRY
.
register_processor
(
_build_tarsier_hf_p
rocessor
,
info
=
_build_tarsier_hf_i
nfo
,
TarsierMultiModalP
rocessor
,
info
=
TarsierProcessingI
nfo
,
dummy_inputs
=
TarsierDummyInputsBuilder
,
)
class
TarsierForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
...
...
vllm/model_executor/models/ultravox.py
View file @
0da93439
...
...
@@ -404,12 +404,14 @@ class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin):
kwargs
[
"layer_head_mask"
]
=
None
for
layer
in
self
.
layers
:
layer_output
s
=
layer
(
hidden_state
s
=
layer
(
hidden_states
,
attention_mask
=
extended_attention_mask
,
**
kwargs
,
)
hidden_states
=
layer_outputs
[
0
]
# BC version that allows for the old tupled output
if
isinstance
(
hidden_states
,
tuple
):
hidden_states
=
hidden_states
[
0
]
hidden_states
=
self
.
ln_post
(
hidden_states
)
hidden_states
=
self
.
linear_out
(
hidden_states
)
...
...
@@ -509,13 +511,14 @@ class ModifiedWhisperEncoder(WhisperEncoder):
kwargs
[
"layer_head_mask"
]
=
None
for
encoder_layer
in
self
.
layers
:
layer_output
s
=
encoder_layer
(
hidden_state
s
=
encoder_layer
(
hidden_states
,
attention_mask
,
**
kwargs
,
)
hidden_states
=
layer_outputs
[
0
]
# BC version that allows for the old tupled output
if
isinstance
(
hidden_states
,
tuple
):
hidden_states
=
hidden_states
[
0
]
hidden_states
=
self
.
layer_norm
(
hidden_states
)
return
hidden_states
...
...
vllm/model_executor/models/whisper_causal.py
View file @
0da93439
...
...
@@ -150,8 +150,10 @@ def create_whisper_attention_backend_with_block_pooling(
new_common_attn_metadata
.
query_start_loc
*=
block_pool_size
new_common_attn_metadata
.
query_start_loc_cpu
*=
block_pool_size
new_common_attn_metadata
.
seq_lens
*=
block_pool_size
new_common_attn_metadata
.
_seq_lens_cpu
*=
block_pool_size
new_common_attn_metadata
.
_num_computed_tokens_cpu
*=
block_pool_size
if
new_common_attn_metadata
.
_seq_lens_cpu
is
not
None
:
new_common_attn_metadata
.
_seq_lens_cpu
*=
block_pool_size
if
new_common_attn_metadata
.
_num_computed_tokens_cpu
is
not
None
:
new_common_attn_metadata
.
_num_computed_tokens_cpu
*=
block_pool_size
new_common_attn_metadata
.
num_actual_tokens
*=
block_pool_size
new_common_attn_metadata
.
max_query_len
*=
block_pool_size
new_common_attn_metadata
.
max_seq_len
*=
block_pool_size
...
...
vllm/model_executor/offloader/prefetch.py
View file @
0da93439
...
...
@@ -431,10 +431,32 @@ class _ModuleOffloader:
Called after process_weights_after_loading to ensure _cpu_storage
contains the final processed weights, not stale pre-loading data.
Parameters whose underlying nn.Parameter was deleted by
process_weights_after_loading (e.g. transient KV-cache scale params)
are pruned from self._param_offloaders so they do not participate in
buffer-pool allocation or prefetching.
"""
for
param_offloader
in
self
.
_param_offloaders
.
values
():
param_offloader
.
sync_cpu_storage
()
# Remove offloaders whose parameter was deleted during
# process_weights_after_loading (e.g. k_scale / v_scale).
deleted
=
[
name
for
name
,
offloader
in
self
.
_param_offloaders
.
items
()
if
getattr
(
offloader
,
"_param_deleted"
,
False
)
]
if
deleted
:
logger
.
debug
(
"Pruning %d transient offloaded param(s) that were deleted "
"by process_weights_after_loading: %s"
,
len
(
deleted
),
deleted
,
)
for
name
in
deleted
:
del
self
.
_param_offloaders
[
name
]
def
get_param_infos
(
self
)
->
list
[
ParamInfo
]:
"""Get parameter metadata for buffer pool allocation.
...
...
@@ -590,6 +612,11 @@ class _CpuParamOffloader(_BaseParamOffloader):
super
().
__init__
(
module
,
param_name
)
self
.
_cpu_storage
:
torch
.
Tensor
|
None
=
None
self
.
_gpu_buffer
:
torch
.
Tensor
|
None
=
None
# Store reference to GPU buffer
# Set to True if the underlying nn.Parameter was deleted by
# process_weights_after_loading (e.g. transient KV-cache scale params
# such as k_scale/v_scale created by BaseKVCacheMethod.create_weights
# and deleted after copying into permanent _k_scale buffers).
self
.
_param_deleted
:
bool
=
False
# Offload to CPU immediately to free GPU memory during model loading
self
.
_offload_to_cpu_internal
()
...
...
@@ -696,8 +723,22 @@ class _CpuParamOffloader(_BaseParamOffloader):
1. process_weights_after_loading may transform weights (quantization)
2. device_loading_context creates NEW CPU tensors when moving back
3. Our old _cpu_storage would have pre-processed or stale data
If the parameter no longer exists on the module (e.g. transient
KV-cache scale parameters such as k_scale/v_scale that are created
by BaseKVCacheMethod.create_weights() and then deleted by
process_weights_after_loading() after copying their values into
permanent _k_scale buffers), the offloader marks itself as deleted
and skips the sync. The caller (_ModuleOffloader.sync_cpu_storage)
is responsible for removing these stale entries.
"""
self
.
_update_cpu_storage_from_param
()
try
:
self
.
_update_cpu_storage_from_param
()
except
AttributeError
:
# The parameter was deleted by process_weights_after_loading.
# Drop the now-stale CPU storage so this offloader can be pruned.
self
.
_param_deleted
=
True
self
.
_cpu_storage
=
None
def
post_init
(
self
):
"""No-op: offloading done in offload_to_cpu/assign_static_buffer."""
...
...
vllm/model_executor/warmup/deep_gemm_warmup.py
View file @
0da93439
...
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
)
from
vllm.model_executor.layers.linear
import
LinearBase
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
from
vllm.model_executor.layers.quantization.mxfp8
import
Mxfp8OnlineLinearMethod
from
vllm.tracing
import
instrument
from
vllm.utils.deep_gemm
import
(
fp8_gemm_nt
,
...
...
@@ -136,8 +137,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
if
not
(
isinstance
(
module
,
LinearBase
)
and
isinstance
(
module
.
quant_method
,
Fp8LinearMethod
)
and
module
.
quant_method
.
block_quant
and
not
module
.
quant_method
.
use_marlin
and
not
isinstance
(
module
.
quant_method
,
Mxfp8OnlineLinearMethod
)
and
getattr
(
module
.
quant_method
,
"block_quant"
,
False
)
and
not
getattr
(
module
.
quant_method
,
"use_marlin"
,
True
)
):
return
False
...
...
vllm/multimodal/audio.py
View file @
0da93439
...
...
@@ -12,17 +12,35 @@ import torch
from
vllm.utils.import_utils
import
PlaceholderModule
try
:
import
librosa
import
av
as
av
except
ImportError
:
libros
a
=
PlaceholderModule
(
"
libros
a"
)
# type: ignore[assignment]
a
v
=
PlaceholderModule
(
"a
v
"
)
# type: ignore[assignment]
try
:
import
resampy
except
ImportError
:
resampy
=
PlaceholderModule
(
"resampy"
)
# type: ignore[assignment]
try
:
import
scipy.signal
as
scipy_signal
except
ImportError
:
scipy_signal
=
PlaceholderModule
(
"scipy"
).
placeholder_attr
(
"signal"
)
# type: ignore[assignment]
# ============================================================
# Aligned with `librosa.get_duration` function
def
get_audio_duration
(
*
,
y
:
npt
.
NDArray
[
np
.
floating
],
sr
:
float
=
22050
)
->
float
:
"""Get the duration of an audio array in seconds.
Args:
y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
sr: Sample rate of the audio in Hz.
Returns:
Duration of the audio in seconds.
"""
n_samples
=
y
.
shape
[
-
1
]
return
float
(
n_samples
)
/
sr
class
ChannelReduction
(
str
,
Enum
):
...
...
@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================
def
resample_audio_
librosa
(
def
resample_audio_
pyav
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
librosa
.
resample
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
"""Resample audio using PyAV (libswresample via FFmpeg).
Args:
audio: Input audio. Can be:
- 1D array ``(samples,)``: mono audio
- 2D array ``(channels, samples)``: stereo audio
orig_sr: Original sample rate in Hz.
target_sr: Target sample rate in Hz.
Returns:
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
"""
orig_sr_int
=
int
(
round
(
orig_sr
))
target_sr_int
=
int
(
round
(
target_sr
))
if
orig_sr_int
==
target_sr_int
:
return
audio
if
audio
.
ndim
==
2
:
# Resample each channel independently and re-stack.
return
np
.
stack
(
[
resample_audio_pyav
(
ch
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
for
ch
in
audio
],
axis
=
0
,
)
expected_len
=
int
(
math
.
ceil
(
audio
.
shape
[
-
1
]
*
target_sr_int
/
orig_sr_int
))
# from_ndarray expects shape (channels, samples) for planar formats.
# libswresample requires a minimum number of input samples to produce
# output frames; pad short inputs with zeros so we always get output,
# then trim to the expected output length.
_MIN_SAMPLES
=
1024
audio_f32
=
np
.
asarray
(
audio
,
dtype
=
np
.
float32
)
if
len
(
audio_f32
)
<
_MIN_SAMPLES
:
audio_f32
=
np
.
pad
(
audio_f32
,
(
0
,
_MIN_SAMPLES
-
len
(
audio_f32
)))
audio_f32
=
audio_f32
.
reshape
(
1
,
-
1
)
resampler
=
av
.
AudioResampler
(
format
=
"fltp"
,
layout
=
"mono"
,
rate
=
target_sr_int
)
frame
=
av
.
AudioFrame
.
from_ndarray
(
audio_f32
,
format
=
"fltp"
,
layout
=
"mono"
)
frame
.
sample_rate
=
orig_sr_int
out_frames
=
resampler
.
resample
(
frame
)
out_frames
.
extend
(
resampler
.
resample
(
None
))
# flush buffered samples
result
=
np
.
concatenate
([
f
.
to_ndarray
()
for
f
in
out_frames
],
axis
=
1
).
squeeze
(
0
)
return
result
[:
expected_len
]
def
resample_audio_resampy
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
resampy
.
resample
(
audio
,
sr_orig
=
orig_sr
,
sr_new
=
target_sr
)
def
resample_audio_scipy
(
...
...
@@ -167,7 +243,7 @@ def resample_audio_scipy(
*
,
orig_sr
:
float
,
target_sr
:
float
,
):
)
->
npt
.
NDArray
[
np
.
floating
]
:
if
orig_sr
>
target_sr
:
return
scipy_signal
.
resample_poly
(
audio
,
1
,
orig_sr
//
target_sr
)
elif
orig_sr
<
target_sr
:
...
...
@@ -181,7 +257,7 @@ class AudioResampler:
def
__init__
(
self
,
target_sr
:
float
|
None
=
None
,
method
:
Literal
[
"
librosa
"
,
"scipy"
]
=
"
librosa
"
,
method
:
Literal
[
"
pyav"
,
"resampy
"
,
"scipy"
]
=
"
resampy
"
,
):
self
.
target_sr
=
target_sr
self
.
method
=
method
...
...
@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol
=
1e-6
,
):
return
audio
if
self
.
method
==
"librosa"
:
return
resample_audio_librosa
(
if
self
.
method
==
"pyav"
:
return
resample_audio_pyav
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
self
.
target_sr
)
if
self
.
method
==
"resampy"
:
return
resample_audio_resampy
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
self
.
target_sr
)
elif
self
.
method
==
"scipy"
:
...
...
@@ -214,7 +292,7 @@ class AudioResampler:
else
:
raise
ValueError
(
f
"Invalid resampling method:
{
self
.
method
}
. "
"Supported methods are '
librosa
' and 'scipy'."
"Supported methods are '
pyav
' and 'scipy'."
)
...
...
vllm/multimodal/media/audio.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
math
from
io
import
BytesIO
from
pathlib
import
Path
...
...
@@ -15,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from
.base
import
MediaIO
try
:
import
libros
a
import
a
v
except
ImportError
:
libros
a
=
PlaceholderModule
(
"
libros
a"
)
# type: ignore[assignment]
a
v
=
PlaceholderModule
(
"a
v
"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
try
:
import
av
import
resampy
except
ImportError
:
av
=
PlaceholderModule
(
"
av
"
)
# type: ignore[assignment]
resampy
=
PlaceholderModule
(
"
resampy
"
)
# type: ignore[assignment]
def
extract_audio_from_video_bytes
(
data
:
bytes
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
"""Extract the audio track from raw video bytes using PyAV.
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES
=
{
1
,
3
,
4
}
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's
audio stream. Resampling to a model-specific rate is left to the
downstream :class:`AudioResampler` in the parsing pipeline.
def
load_audio_pyav
(
path
:
BytesIO
|
Path
|
str
,
*
,
sr
:
float
|
None
=
22050
,
mono
:
bool
=
True
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args:
data: Raw video file bytes (e.g. from an mp4 file).
path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns:
A tuple of
``(waveform, sample_rate)``
suitable for use as an
:class:`AudioItem`
.
``(waveform, sample_rate)``
where *waveform* is a 1-D float32
NumPy array and *sample_rate* is the native sample rate in Hz
.
"""
if
data
is
None
or
len
(
data
)
==
0
:
raise
ValueError
(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
native_sr
=
None
try
:
with
av
.
open
(
BytesIO
(
data
)
)
as
container
:
with
av
.
open
(
path
)
as
container
:
if
not
container
.
streams
.
audio
:
raise
ValueError
(
"No audio stream found
in the video
."
)
raise
ValueError
(
"No audio stream found."
)
stream
=
container
.
streams
.
audio
[
0
]
stream
.
thread_type
=
"AUTO"
native_sr
=
stream
.
rate
sr
=
sr
or
native_sr
chunks
:
list
[
npt
.
NDArray
]
=
[]
for
frame
in
container
.
decode
(
audio
=
0
):
arr
=
frame
.
to_ndarray
()
chunks
.
append
(
arr
.
mean
(
axis
=
0
)
if
arr
.
ndim
>
1
else
arr
)
needs_resampling
=
not
math
.
isclose
(
float
(
sr
),
float
(
native_sr
),
rel_tol
=
0.0
,
abs_tol
=
1e-6
,
)
resampler
=
(
av
.
AudioResampler
(
format
=
"fltp"
,
layout
=
"mono"
,
rate
=
sr
)
if
needs_resampling
else
None
)
for
frame
in
container
.
decode
(
stream
):
if
needs_resampling
:
assert
resampler
is
not
None
for
out_frame
in
resampler
.
resample
(
frame
):
chunks
.
append
(
out_frame
.
to_ndarray
())
else
:
chunks
.
append
(
frame
.
to_ndarray
())
except
ValueError
:
raise
except
Exception
as
e
:
...
...
@@ -78,37 +100,54 @@ def extract_audio_from_video_bytes(
if
not
chunks
:
raise
ValueError
(
"No audio found in the video."
)
audio
=
np
.
concatenate
(
chunks
).
astype
(
np
.
float32
)
return
audio
,
float
(
native_sr
)
audio
=
np
.
concatenate
(
chunks
,
axis
=-
1
).
astype
(
np
.
float32
)
if
mono
and
audio
.
ndim
>
1
:
audio
=
np
.
mean
(
audio
,
axis
=
0
)
return
audio
,
sr
def
is_video
(
data
:
bytes
)
->
bool
:
"""Check if the fetched bytes are video"""
if
len
(
data
)
<
12
:
return
False
box_type
=
data
[
4
:
8
]
major_brand
=
data
[
8
:
12
]
def
load_audio_soundfile
(
path
:
BytesIO
|
Path
|
str
,
*
,
sr
:
float
|
None
=
22050
,
mono
:
bool
=
True
,
)
->
tuple
[
np
.
ndarray
,
int
]:
"""Load audio via soundfile"""
with
soundfile
.
SoundFile
(
path
)
as
f
:
native_sr
=
f
.
samplerate
y
=
f
.
read
(
dtype
=
"float32"
,
always_2d
=
False
).
T
MP4_BRANDS
=
{
b
"mp41"
,
b
"mp42"
,
# MP4
b
"isom"
,
# ISO Base Media
b
"iso2"
,
b
"iso4"
,
b
"iso5"
,
b
"iso6"
,
b
"M4V "
,
b
"M4A "
,
# Apple
b
"avc1"
,
# H.264
b
"dash"
,
# DASH
b
"mmp4"
,
b
"MSNV"
,
}
if
mono
and
y
.
ndim
>
1
:
y
=
np
.
mean
(
y
,
axis
=
tuple
(
range
(
y
.
ndim
-
1
)))
is_avi
=
data
[:
4
]
==
b
"RIFF"
and
major_brand
==
b
"AVI "
is_mp4
=
box_type
==
b
"ftyp"
and
major_brand
in
MP4_BRANDS
return
is_mp4
or
is_avi
if
sr
is
not
None
and
sr
!=
native_sr
:
y
=
resampy
.
resample
(
y
,
sr_orig
=
native_sr
,
sr_new
=
sr
)
return
y
,
int
(
sr
)
return
y
,
native_sr
def load_audio(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
):
    """Load audio via soundfile, falling back to PyAV on selected errors.

    ``load_audio_soundfile`` is tried first. If it raises
    ``soundfile.LibsndfileError`` with a code listed in ``_BAD_SF_CODES``,
    the same input is handed to ``load_audio_pyav`` instead.

    Args:
        path: In-memory buffer, filesystem path, or string path to load.
        sr: Forwarded unchanged to the selected loader.
        mono: Forwarded unchanged to the selected loader.

    Returns:
        The value returned by whichever loader succeeds.

    Raises:
        soundfile.LibsndfileError: If soundfile fails with a code that is
            not in ``_BAD_SF_CODES``.
        ValueError: If the PyAV fallback fails for any reason; the PyAV
            exception is attached as the cause.
    """
    try:
        return load_audio_soundfile(path, sr=sr, mono=mono)
    except soundfile.LibsndfileError as sf_error:
        # Only the codes in _BAD_SF_CODES trigger the fallback; every other
        # libsndfile failure is propagated untouched.
        if sf_error.code not in _BAD_SF_CODES:
            raise

        # soundfile may have moved the buffer's read position before it
        # failed, so rewind to let PyAV start from the first byte.
        if isinstance(path, BytesIO):
            path.seek(0)

        try:
            decoded = load_audio_pyav(path, sr=sr, mono=mono)
        except Exception as pyav_exc:
            raise ValueError("Invalid or unsupported audio file.") from pyav_exc
        return decoded
class
AudioMediaIO
(
MediaIO
[
tuple
[
npt
.
NDArray
,
float
]]):
...
...
@@ -129,19 +168,17 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self
.
kwargs
=
kwargs
def
load_bytes
(
self
,
data
:
bytes
)
->
tuple
[
npt
.
NDArray
,
float
]:
if
is_video
(
data
):
return
extract_audio_from_video_bytes
(
data
)
return
librosa
.
load
(
BytesIO
(
data
),
sr
=
None
)
return
load_audio
(
BytesIO
(
data
),
sr
=
None
)
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
self
.
load_bytes
(
base64
.
b64decode
(
data
))
return
self
.
load_bytes
(
py
base64
.
b64decode
(
data
))
def
load_file
(
self
,
filepath
:
Path
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
l
ibrosa
.
load
(
filepath
,
sr
=
None
)
return
l
oad_audio
(
filepath
,
sr
=
None
)
def
encode_base64
(
self
,
...
...
@@ -155,7 +192,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
soundfile
.
write
(
buffer
,
audio
,
sr
,
format
=
audio_format
)
data
=
buffer
.
getvalue
()
return
base64
.
b64encode
(
data
).
decode
(
"utf-8"
)
return
py
base64
.
b64encode
(
data
).
decode
(
"utf-8"
)
class
AudioEmbeddingMediaIO
(
MediaIO
[
torch
.
Tensor
]):
...
...
vllm/multimodal/media/video.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
functools
import
partial
from
pathlib
import
Path
from
typing
import
Any
import
numpy
as
np
import
numpy.typing
as
npt
import
pybase64
from
PIL
import
Image
from
vllm
import
envs
...
...
@@ -80,11 +80,23 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
"image/jpeg"
,
)
return
np
.
stack
(
frames
=
np
.
stack
(
[
np
.
asarray
(
load_frame
(
frame_data
))
for
frame_data
in
data
.
split
(
","
)]
),
{}
return
self
.
load_bytes
(
base64
.
b64decode
(
data
))
)
total
=
int
(
frames
.
shape
[
0
])
fps
=
float
(
self
.
kwargs
.
get
(
"fps"
,
1
))
duration
=
total
/
fps
if
fps
>
0
else
0.0
metadata
=
{
"total_num_frames"
:
total
,
"fps"
:
fps
,
"duration"
:
duration
,
"video_backend"
:
"jpeg_sequence"
,
"frames_indices"
:
list
(
range
(
total
)),
"do_sample_frames"
:
False
,
}
return
frames
,
metadata
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
))
def
load_file
(
self
,
filepath
:
Path
)
->
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]:
with
filepath
.
open
(
"rb"
)
as
f
:
...
...
vllm/multimodal/parse.py
View file @
0da93439
...
...
@@ -497,7 +497,7 @@ class MultiModalDataParser:
*
,
target_sr
:
float
|
None
=
None
,
target_channels
:
int
|
None
=
None
,
audio_resample_method
:
Literal
[
"
librosa
"
,
"scipy"
]
=
"
librosa
"
,
audio_resample_method
:
Literal
[
"
pyav
"
,
"scipy"
]
=
"
pyav
"
,
video_needs_metadata
:
bool
=
False
,
expected_hidden_size
:
int
|
None
=
None
,
)
->
None
:
...
...
vllm/multimodal/processing/processor.py
View file @
0da93439
...
...
@@ -1682,6 +1682,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
class
EncDecMultiModalProcessor
(
BaseMultiModalProcessor
[
_I
]):
skip_decoder_start_token
:
bool
=
False
@
abstractmethod
def
create_encoder_prompt
(
self
,
...
...
vllm/parser/abstract_parser.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
json
from
abc
import
abstractmethod
from
collections.abc
import
Sequence
...
...
@@ -18,7 +19,7 @@ from openai.types.responses.response_output_text import Logprob
from
openai.types.responses.response_reasoning_item
import
(
Content
as
ResponseReasoningTextContent
,
)
from
pydantic
import
TypeAdapter
from
pydantic
import
TypeAdapter
,
ValidationError
from
vllm.entrypoints.chat_utils
import
make_tool_call_id
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
...
...
@@ -154,7 +155,9 @@ class Parser:
@
abstractmethod
def
extract_response_outputs
(
self
,
*
,
model_output
:
str
,
model_output_token_ids
:
Sequence
[
int
],
request
:
ResponsesRequest
,
enable_auto_tools
:
bool
=
False
,
tool_call_id_type
:
str
=
"random"
,
...
...
@@ -169,6 +172,7 @@ class Parser:
Args:
model_output: The complete model-generated string.
model_output_token_ids: The token IDs of the model output.
request: The request object used to generate the output.
enable_auto_tools: Whether to enable automatic tool call parsing.
tool_call_id_type: Type of tool call ID generation ("random", etc).
...
...
@@ -195,7 +199,7 @@ class Parser:
request: The request object used to generate the output.
Returns:
A tuple of (reasoning
_content
, response_content).
A tuple of (reasoning, response_content).
"""
@
abstractmethod
...
...
@@ -312,7 +316,9 @@ class DelegatingParser(Parser):
def
extract_response_outputs
(
self
,
*
,
model_output
:
str
,
model_output_token_ids
:
Sequence
[
int
],
request
:
ResponsesRequest
,
enable_auto_tools
:
bool
=
False
,
tool_call_id_type
:
str
=
"random"
,
...
...
@@ -422,15 +428,19 @@ class DelegatingParser(Parser):
if
request
.
tool_choice
==
"required"
:
# Required tool calls - parse JSON
assert
content
is
not
None
tool_calls
=
TypeAdapter
(
list
[
FunctionDefinition
]).
validate_json
(
content
)
function_calls
.
extend
(
FunctionCall
(
name
=
tool_call
.
name
,
arguments
=
json
.
dumps
(
tool_call
.
parameters
,
ensure_ascii
=
False
),
tool_calls
=
[]
with
contextlib
.
suppress
(
ValidationError
):
content
=
content
or
""
tool_calls
=
TypeAdapter
(
list
[
FunctionDefinition
]).
validate_json
(
content
)
for
tool_call
in
tool_calls
:
function_calls
.
append
(
FunctionCall
(
name
=
tool_call
.
name
,
arguments
=
json
.
dumps
(
tool_call
.
parameters
,
ensure_ascii
=
False
),
)
)
for
tool_call
in
tool_calls
)
return
function_calls
,
None
# Clear content since tool is called.
if
(
...
...
vllm/parser/parser_manager.py
View file @
0da93439
...
...
@@ -199,7 +199,7 @@ class ParserManager:
parser
:
type
[
ToolParser
]
|
None
=
None
if
not
enable_auto_tools
or
tool_parser_name
is
None
:
return
parser
logger
.
info
(
'"auto" tool choice has been enabled.'
)
logger
.
info
_once
(
'"auto" tool choice has been enabled.'
)
try
:
if
(
...
...
vllm/platforms/cpu.py
View file @
0da93439
...
...
@@ -281,6 +281,9 @@ class CpuPlatform(Platform):
# Disable multi-stream for shared experts as no Stream on CPU
os
.
environ
[
"VLLM_DISABLE_SHARED_EXPERTS_STREAM"
]
=
"1"
# Keep Inductor from generating num_thread(), which breaks the thread binding
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
# Intel OpenMP setting
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
if
"libiomp5.so"
in
ld_preload_str
:
...
...
vllm/platforms/cuda.py
View file @
0da93439
...
...
@@ -4,6 +4,8 @@
pynvml. However, it should not initialize cuda context.
"""
from
__future__
import
annotations
import
os
from
collections.abc
import
Callable
from
datetime
import
timedelta
...
...
@@ -17,6 +19,7 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import
vllm._C
# noqa
import
vllm._C_stable_libtorch
# noqa
from
vllm.logger
import
init_logger
from
vllm.utils.import_utils
import
import_pynvml
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
...
...
@@ -49,21 +52,34 @@ def _get_backend_priorities(
use_mla
:
bool
,
device_capability
:
DeviceCapability
,
num_heads
:
int
|
None
=
None
,
kv_cache_dtype
:
CacheDType
|
None
=
None
,
)
->
list
[
AttentionBackendEnum
]:
"""Get backend priorities with lazy import to avoid circular dependency."""
if
use_mla
:
if
device_capability
.
major
==
10
:
# Prefer FlashInfer at low head counts (FlashMLA uses padding)
if
num_heads
is
not
None
and
num_heads
<=
16
:
# Sparse MLA backend priorities
# See https://github.com/vllm-project/vllm/issues/35807 for
# benchmark results
if
kv_cache_dtype
is
not
None
and
kv_cache_dtype
.
startswith
(
"fp8"
):
# Prefer FlashInfer for fp8 kv cache
sparse_backends
=
[
AttentionBackendEnum
.
FLASHINFER_MLA_SPARSE
,
AttentionBackendEnum
.
FLASHMLA_SPARSE
,
]
else
:
sparse_backends
=
[
AttentionBackendEnum
.
FLASHMLA_SPARSE
,
AttentionBackendEnum
.
FLASHINFER_MLA_SPARSE
,
]
# BF16 KV Cache
# Prefer FlashInfer at low head counts (FlashMLA uses padding)
if
num_heads
is
not
None
and
num_heads
<=
16
:
sparse_backends
=
[
AttentionBackendEnum
.
FLASHINFER_MLA_SPARSE
,
AttentionBackendEnum
.
FLASHMLA_SPARSE
,
]
else
:
sparse_backends
=
[
AttentionBackendEnum
.
FLASHMLA_SPARSE
,
AttentionBackendEnum
.
FLASHINFER_MLA_SPARSE
,
]
return
[
AttentionBackendEnum
.
FLASHINFER_MLA
,
AttentionBackendEnum
.
CUTLASS_MLA
,
...
...
@@ -165,7 +181,7 @@ class CudaPlatformBase(Platform):
pass
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
"
VllmConfig
"
)
->
None
:
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
parallel_config
=
vllm_config
.
parallel_config
model_config
=
vllm_config
.
model_config
...
...
@@ -198,11 +214,11 @@ class CudaPlatformBase(Platform):
def
get_valid_backends
(
cls
,
device_capability
:
DeviceCapability
,
attn_selector_config
:
"
AttentionSelectorConfig
"
,
attn_selector_config
:
AttentionSelectorConfig
,
num_heads
:
int
|
None
=
None
,
)
->
tuple
[
list
[
tuple
[
"
AttentionBackendEnum
"
,
int
]],
dict
[
"
AttentionBackendEnum
"
,
tuple
[
int
,
list
[
str
]]],
list
[
tuple
[
AttentionBackendEnum
,
int
]],
dict
[
AttentionBackendEnum
,
tuple
[
int
,
list
[
str
]]],
]:
valid_backends_priorities
=
[]
invalid_reasons
:
dict
[
AttentionBackendEnum
,
tuple
[
int
,
list
[
str
]]]
=
{}
...
...
@@ -211,6 +227,7 @@ class CudaPlatformBase(Platform):
attn_selector_config
.
use_mla
,
device_capability
,
num_heads
,
attn_selector_config
.
kv_cache_dtype
,
)
for
priority
,
backend
in
enumerate
(
backend_priorities
):
try
:
...
...
@@ -231,8 +248,8 @@ class CudaPlatformBase(Platform):
@
classmethod
def
get_attn_backend_cls
(
cls
,
selected_backend
:
"
AttentionBackendEnum | None
"
,
attn_selector_config
:
"
AttentionSelectorConfig
"
,
selected_backend
:
AttentionBackendEnum
|
None
,
attn_selector_config
:
AttentionSelectorConfig
,
num_heads
:
int
|
None
=
None
,
)
->
str
:
device_capability
=
cls
.
get_device_capability
()
...
...
@@ -324,7 +341,7 @@ class CudaPlatformBase(Platform):
return
selected_backend
.
get_path
()
@
classmethod
def
get_supported_vit_attn_backends
(
cls
)
->
list
[
"
AttentionBackendEnum
"
]:
def
get_supported_vit_attn_backends
(
cls
)
->
list
[
AttentionBackendEnum
]:
if
cls
.
has_device_capability
(
80
):
return
[
AttentionBackendEnum
.
FLASH_ATTN
,
...
...
@@ -345,8 +362,8 @@ class CudaPlatformBase(Platform):
cls
,
head_size
:
int
,
dtype
:
torch
.
dtype
,
backend
:
"
AttentionBackendEnum | None
"
=
None
,
)
->
"
AttentionBackendEnum
"
:
backend
:
AttentionBackendEnum
|
None
=
None
,
)
->
AttentionBackendEnum
:
if
backend
is
not
None
:
assert
backend
in
cls
.
get_supported_vit_attn_backends
(),
(
f
"Backend
{
backend
}
is not supported for vit attention. "
...
...
@@ -371,7 +388,8 @@ class CudaPlatformBase(Platform):
)
if
is_backend_supported
:
logger
.
info_once
(
f
"Using backend
{
vit_attn_backend
}
for vit attention"
f
"Using backend
{
vit_attn_backend
}
for vit attention"
,
scope
=
"local"
,
)
return
vit_attn_backend
except
ImportError
:
...
...
@@ -493,6 +511,11 @@ class CudaPlatformBase(Platform):
def support_static_graph_mode(cls) -> bool:
    """Report static graph mode as supported on this platform (always True)."""
    return True
@classmethod
def support_deep_gemm(cls) -> bool:
    """Report whether DeepGEMM can be used on the current device.

    Currently only Hopper and Blackwell GPUs are supported, which is
    checked as device capability 90 or the capability family 100.
    """
    # Short-circuit exactly like a single `or` expression: the family
    # check only runs when the capability-90 check is falsy.
    is_cap_90 = cls.is_device_capability(90)
    return is_cap_90 or cls.is_device_capability_family(100)
@classmethod
def num_compute_units(cls, device_id: int = 0) -> int:
    """Return the ``multi_processor_count`` of the given CUDA device.

    Args:
        device_id: Index passed to ``torch.cuda.get_device_properties``.

    Returns:
        The ``multi_processor_count`` field of the device properties.
    """
    device_props = torch.cuda.get_device_properties(device_id)
    return device_props.multi_processor_count
...
vllm/platforms/interface.py
View file @
0da93439
...
...
@@ -712,6 +712,13 @@ class Platform:
"""
return
False
@classmethod
def support_deep_gemm(cls) -> bool:
    """Report whether the current platform supports DeepGEMM.

    This implementation always answers ``False``.
    """
    return False
@
classmethod
def
use_custom_op_collectives
(
cls
)
->
bool
:
"""
...
...
vllm/platforms/rocm.py
View file @
0da93439
...
...
@@ -28,6 +28,7 @@ try:
from
amdsmi
import
(
AmdSmiException
,
amdsmi_get_gpu_asic_info
,
amdsmi_get_gpu_device_uuid
,
amdsmi_get_processor_handles
,
amdsmi_init
,
amdsmi_shut_down
,
...
...
@@ -439,8 +440,6 @@ class RocmPlatform(Platform):
device_capability
=
cls
.
get_device_capability
()
assert
device_capability
is
not
None
attn_selector_config
=
attn_selector_config
.
_replace
(
block_size
=
None
)
# First try checking just the selected backend, if there is one.
if
selected_backend
is
not
None
:
try
:
...
...
@@ -611,6 +610,20 @@ class RocmPlatform(Platform):
return
_ROCM_DEVICE_ID_NAME_MAP
[
device_name
]
return
asic_info
[
"market_name"
]
@classmethod
@with_amdsmi_context
def get_device_uuid(cls, device_id: int = 0) -> str:
    """Return the UUID reported by amdsmi for the given device.

    Args:
        device_id: Index into the list returned by
            ``amdsmi_get_processor_handles()``.

    Returns:
        The value returned by ``amdsmi_get_gpu_device_uuid``, or an empty
        string if either amdsmi query raises ``AmdSmiException``.
    """
    try:
        device = amdsmi_get_processor_handles()[device_id]
    except AmdSmiException as error:
        logger.error("amdsmi device query failed ", exc_info=error)
        return ""

    try:
        return amdsmi_get_gpu_device_uuid(device)
    except AmdSmiException as error:
        logger.error("amdsmi device uuid query failed ", exc_info=error)
        # Return the same sentinel as the handle-lookup failure above.
        # Falling through to a `return device_uuid` here would raise
        # UnboundLocalError because the assignment never happened.
        return ""
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
device_props
=
torch
.
cuda
.
get_device_properties
(
device_id
)
...
...
@@ -668,7 +681,6 @@ class RocmPlatform(Platform):
def
check_and_update_config
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
from
vllm.config.compilation
import
CUDAGraphMode
cache_config
=
vllm_config
.
cache_config
compilation_config
=
vllm_config
.
compilation_config
parallel_config
=
vllm_config
.
parallel_config
...
...
@@ -690,32 +702,9 @@ class RocmPlatform(Platform):
)
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
if
cache_config
and
not
cache_config
.
user_specified_block_size
:
if
(
envs
.
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
and
envs
.
VLLM_ROCM_USE_AITER
# NOTE: This block has been deprecated
# or get_env_variable_attn_backend()
# == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
# TODO: monitor https://github.com/vllm-project/vllm/pull/30396
# to see how we can transition to the new way of selecting
# attention backends
):
cache_config
.
block_size
=
64
logger
.
warning
(
"[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
)
else
:
cache_config
.
block_size
=
16
if
parallel_config
.
worker_cls
==
"auto"
:
parallel_config
.
worker_cls
=
"vllm.v1.worker.gpu_worker.Worker"
@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
    """Do nothing; placeholder hook for backend-driven block-size selection.

    Args:
        vllm_config: Unused by this implementation.
    """
    # TODO: ROCm still sets block_size in check_and_update_config.
    # Move that logic here so block_size is chosen by the backend.
    return None
@
classmethod
def
verify_model_arch
(
cls
,
model_arch
:
str
)
->
None
:
if
model_arch
in
_ROCM_UNSUPPORTED_MODELS
:
...
...
Prev
1
…
25
26
27
28
29
30
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment