Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
639 additions
and
951 deletions
+639
-951
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+115
-190
vllm/model_executor/models/qwen2_cls.py
vllm/model_executor/models/qwen2_cls.py
+0
-104
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+192
-398
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+6
-13
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/telechat2.py
+14
-13
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+60
-60
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+9
-0
vllm/multimodal/__init__.py
vllm/multimodal/__init__.py
+1
-1
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+18
-0
vllm/multimodal/base.py
vllm/multimodal/base.py
+7
-7
vllm/multimodal/image.py
vllm/multimodal/image.py
+12
-0
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+17
-13
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+113
-53
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+3
-3
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+35
-92
vllm/multimodal/video.py
vllm/multimodal/video.py
+32
-0
vllm/outputs.py
vllm/outputs.py
+2
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+1
-1
vllm/scripts.py
vllm/scripts.py
+1
-1
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+1
-1
No files found.
vllm/model_executor/models/qwen2_audio.py
View file @
96ae75ad
...
...
@@ -19,45 +19,43 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
from
functools
import
cached_property
,
lru_cache
from
typing
import
(
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
from
functools
import
cached_property
from
typing
import
(
Any
,
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
import
librosa
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
transformers
import
Qwen2AudioEncoder
from
transformers
import
BatchFeature
,
ProcessorMixin
from
transformers.models.qwen2_audio
import
(
Qwen2AudioConfig
,
Qwen2AudioEncoder
,
Qwen2AudioProcessor
)
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.logger
import
init_logger
from
vllm.inputs
import
InputContext
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
MultiModalDataItems
,
ProcessorInputs
,
PromptReplacement
)
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
logger
=
init_logger
(
__name__
)
# # === Audio Inputs === #
class
Qwen2AudioInputs
(
TypedDict
):
input_features
:
torch
.
Tensor
"""Shape:
`(num_audios, num_mel_bins, 3000)`
"""
"""Shape: `(num_audios, num_mel_bins, 3000)`"""
feature_attention_mask
:
torch
.
Tensor
"""Shape: `(num_audios, 3000)`
"""
"""Shape: `(num_audios, 3000)`"""
# === Audio Encoder === #
...
...
@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module):
return
hidden_states
def
dummy_data_for_qwen2_audio
(
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
]):
num_audios
=
mm_counts
[
"audio"
]
max_tokens_per_audio
=
get_max_qwen2_audio_audio_tokens
(
ctx
)
max_llm_audio_tokens
=
max_tokens_per_audio
*
num_audios
if
seq_len
-
max_llm_audio_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-Audio cannot process
{
num_audios
}
audios in a prompt, "
"please increase max_model_len or reduce audio limit by "
"--limit-mm-per-prompt."
)
audio_token_index
=
ctx
.
model_config
.
hf_config
.
audio_token_index
dummy_seqdata
=
SequenceData
.
from_prompt_token_counts
(
(
audio_token_index
,
max_llm_audio_tokens
),
(
0
,
seq_len
-
max_llm_audio_tokens
),
)
dummy_audio
=
np
.
full
((
max_llm_audio_tokens
*
2
*
2
*
160
,
),
0.
)
return
DummyData
(
dummy_seqdata
,
{
"audio"
:
[(
dummy_audio
,
16000
)]
*
num_audios
},
{
"audio"
:
consecutive_placeholder_ranges
(
num_items
=
num_audios
,
item_size
=
max_tokens_per_audio
)
})
def
get_processor
(
processor_name
:
str
,
*
args
,
trust_remote_code
:
bool
=
False
,
**
kwargs
,
):
"""Gets a processor for the given model name via HuggingFace.
Derived from `vllm.transformers_utils.image_processor.get_image_processor`.
"""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
try
:
processor
=
AutoProcessor
.
from_pretrained
(
processor_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
**
kwargs
)
except
ValueError
as
e
:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
# Unlike AutoTokenizer, AutoProcessor does not separate such errors
if
not
trust_remote_code
:
err_msg
=
(
"Failed to load the processor. If the processor is "
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
# From Qwen2AudioEncoder._get_feat_extract_output_lengths
def
_get_feat_extract_output_lengths
(
input_lengths
:
torch
.
LongTensor
):
feat_lengths
=
(
input_lengths
-
1
)
//
2
+
1
output_lengths
=
(
feat_lengths
-
2
)
//
2
+
1
return
feat_lengths
,
output_lengths
return
processor
def
get_max_qwen2_audio_audio_tokens
(
ctx
:
InputContext
)
->
int
:
hf_config
=
ctx
.
get_hf_config
(
Qwen2AudioConfig
)
max_source_position
=
hf_config
.
audio_config
.
max_source_positions
output_lengths
=
(
max_source_position
-
2
)
//
2
+
1
return
output_lengths
cached_get_processor
=
lru_cache
(
get_processor
)
class
Qwen2AudioMultiModalProcessor
(
BaseMultiModalProcessor
):
def
_get_feat_extract_output_lengths
(
input_lengths
:
torch
.
LongTensor
):
"""
Computes the output length of the convolutional layers
and the output length of the audio encoder
"""
input_lengths
=
(
input_lengths
-
1
)
//
2
+
1
output_lengths
=
(
input_lengths
-
2
)
//
2
+
1
return
input_lengths
,
output_lengths
def
_get_hf_processor
(
self
)
->
Qwen2AudioProcessor
:
return
self
.
ctx
.
get_hf_processor
(
Qwen2AudioProcessor
)
def
_get_feature_extractor
(
self
)
->
WhisperFeatureExtractor
:
return
self
.
_get_hf_processor
().
feature_extractor
# type: ignore
def
get_max_qwen2_audio_audio_tokens
(
ctx
:
InputContext
)
->
int
:
max_source_position
=
(
ctx
.
model_config
.
hf_config
.
audio_config
.
max_source_positions
)
output_lengths
=
(
max_source_position
-
2
)
//
2
+
1
return
output_lengths
def
_get_processor_data
(
self
,
mm_items
:
MultiModalDataItems
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
# resample audio to the model's sampling rate
feature_extractor
=
self
.
_get_feature_extractor
()
mm_items
.
resample_audios
(
feature_extractor
.
sampling_rate
)
return
super
().
_get_processor_data
(
mm_items
)
def
input_processor_for_qwen2_audio
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
)
->
DecoderOnlyInputs
:
multi_modal_data
=
inputs
.
get
(
"multi_modal_data"
)
if
multi_modal_data
is
None
or
"audio"
not
in
multi_modal_data
:
return
inputs
audios
=
multi_modal_data
[
"audio"
]
if
not
isinstance
(
audios
,
list
):
audios
=
[
audios
]
if
len
(
audios
)
==
0
:
return
inputs
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
resampled_audios
=
[
librosa
.
resample
(
audio
,
orig_sr
=
sampling_rate
,
target_sr
=
processor
.
feature_extractor
.
sampling_rate
)
for
audio
,
sampling_rate
in
audios
]
audio_input_lengths
=
np
.
array
(
[
min
(
3000
,
_
.
shape
[
0
]
//
160
+
1
)
for
_
in
resampled_audios
])
audio_feat_lengths
,
audio_output_lengths
=
_get_feat_extract_output_lengths
(
audio_input_lengths
)
audio_token_index
=
ctx
.
model_config
.
hf_config
.
audio_token_index
input_ids
=
inputs
[
'prompt_token_ids'
]
new_input_ids
=
[]
audio_num
=
input_ids
.
count
(
audio_token_index
)
assert
len
(
audio_input_lengths
)
==
audio_num
,
\
(
f
'The text input contains
{
audio_num
}
audio tokens, '
f
'but
{
len
(
audio_input_lengths
)
}
audios provided'
)
start
=
0
for
audio_idx
in
range
(
audio_num
):
end
=
input_ids
.
index
(
audio_token_index
,
start
)
new_input_ids
.
extend
(
input_ids
[
start
:
end
])
# text part
new_input_ids
.
extend
([
audio_token_index
]
*
audio_output_lengths
[
audio_idx
])
start
=
end
+
1
new_input_ids
.
extend
(
input_ids
[
start
:])
return
token_inputs
(
prompt_token_ids
=
new_input_ids
,
prompt
=
inputs
.
get
(
"prompt"
),
multi_modal_data
=
multi_modal_data
,
)
def
input_mapper_for_qwen2_audio
(
ctx
:
InputContext
,
multi_modal_data
:
Union
[
np
.
ndarray
,
List
[
np
.
ndarray
]],
)
->
MultiModalKwargs
:
"""Input mapper for Qwen2-Audio."""
if
not
isinstance
(
multi_modal_data
,
list
):
multi_modal_data
=
[
multi_modal_data
]
if
len
(
multi_modal_data
)
==
0
:
return
MultiModalKwargs
()
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
audio_feature_extractor
=
processor
.
feature_extractor
if
audio_feature_extractor
is
None
:
raise
RuntimeError
(
"No HuggingFace audio_feature_extractor is available "
"to process the audio object"
)
try
:
resampled_audios
=
[
librosa
.
resample
(
audio
,
orig_sr
=
sampling_rate
,
target_sr
=
processor
.
feature_extractor
.
sampling_rate
)
for
audio
,
sampling_rate
in
multi_modal_data
def
_call_hf_processor
(
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
processor_data
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
processor_data
=
dict
(
processor_data
)
audios
=
processor_data
.
pop
(
"audios"
,
[])
if
audios
:
processor_data
[
"audios"
]
=
audios
feature_extractor
=
self
.
_get_feature_extractor
()
mm_processor_kwargs
=
dict
(
**
mm_processor_kwargs
,
sampling_rate
=
feature_extractor
.
sampling_rate
,
)
else
:
# NOTE: WhisperFeatureExtractor cannot handle empty list of audios
pass
return
super
().
_call_hf_processor
(
hf_processor
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
def
_get_prompt_replacements
(
self
,
mm_items
:
MultiModalDataItems
,
hf_inputs
:
BatchFeature
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
list
[
PromptReplacement
]:
hf_config
=
self
.
ctx
.
get_hf_config
(
Qwen2AudioConfig
)
placeholder
=
hf_config
.
audio_token_index
feature_attention_mask
=
hf_inputs
.
get
(
"feature_attention_mask"
)
if
feature_attention_mask
is
None
:
audio_output_lengths
=
[]
else
:
_
,
audio_output_lengths
=
_get_feat_extract_output_lengths
(
feature_attention_mask
.
sum
(
-
1
))
def
get_replacement_qwen2_audio
(
item_idx
:
int
):
return
[
placeholder
]
*
audio_output_lengths
[
item_idx
]
return
[
PromptReplacement
(
modality
=
"audio"
,
target
=
[
placeholder
],
replacement
=
get_replacement_qwen2_audio
,
)
]
batch_data
=
audio_feature_extractor
(
resampled_audios
,
sampling_rate
=
16000
,
return_attention_mask
=
True
,
padding
=
"max_length"
,
return_tensors
=
"pt"
).
data
batch_data
[
"feature_attention_mask"
]
=
batch_data
.
pop
(
"attention_mask"
)
except
Exception
:
logger
.
error
(
"Failed to process audio (%s)"
,
multi_modal_data
)
raise
return
MultiModalKwargs
(
batch_data
)
@
INPUT_REGISTRY
.
register_dummy_data
(
dummy_data_for_qwen2_audio
)
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_qwen2_audio
)
@
MULTIMODAL_REGISTRY
.
register_input_mapper
(
"audio"
,
input_mapper_for_qwen2_audio
)
def
_get_dummy_mm_inputs
(
self
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
ProcessorInputs
:
feature_extractor
=
self
.
_get_feature_extractor
()
sampling_rate
=
feature_extractor
.
sampling_rate
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
audio_count
=
mm_counts
[
"audio"
]
audio
=
np
.
zeros
(
audio_len
)
data
=
{
"audio"
:
[
audio
]
*
audio_count
}
return
ProcessorInputs
(
prompt_text
=
"<|AUDIO|>"
*
audio_count
,
mm_data
=
data
,
mm_processor_kwargs
=
{},
)
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
"audio"
,
get_max_qwen2_audio_audio_tokens
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Qwen2AudioMultiModalProcessor
)
class
Qwen2AudioForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
...
...
@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return
get_sampler
()
def
_validate_and_reshape_mm_tensor
(
self
,
mm_input
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]],
def
_validate_and_reshape_mm_tensor
(
self
,
mm_input
:
object
,
name
:
str
)
->
torch
.
Tensor
:
if
not
isinstance
(
mm_input
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
f
"Incorrect type of
{
name
}
. "
...
...
vllm/model_executor/models/qwen2_cls.py
deleted
100644 → 0
View file @
f9f4a735
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 Kakao Corp. (Kanana-X Team)
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
"""Inference-only Qwen2-Classification model compatible with HF weights."""
from
typing
import
Iterable
,
List
,
Optional
,
Set
,
Tuple
import
torch
from
torch
import
nn
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.linear
import
RowParallelLinear
from
vllm.model_executor.layers.pooler
import
Pooler
,
PoolingType
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
AutoWeightsLoader
,
maybe_prefix
class
Qwen2ForSequenceClassification
(
nn
.
Module
,
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
"k_proj"
,
"v_proj"
,
],
"gate_up_proj"
:
[
"gate_proj"
,
"up_proj"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
pooler_config
=
vllm_config
.
model_config
.
pooler_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
Qwen2Model
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
# hidden_states from Qwen2Model has been reduced,
# the input of score layer is not parallelized.
self
.
score
=
RowParallelLinear
(
config
.
hidden_size
,
config
.
num_labels
,
quant_config
=
quant_config
,
input_is_parallel
=
False
,
bias
=
False
,
prefix
=
maybe_prefix
(
prefix
,
"score"
))
self
.
_pooler
=
Pooler
.
from_config_with_defaults
(
pooler_config
,
pooling_type
=
PoolingType
.
LAST
,
normalize
=
False
,
softmax
=
True
)
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
get_input_embeddings
(
input_ids
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
,
inputs_embeds
)
logits
,
_
=
self
.
score
(
hidden_states
)
return
logits
def
pooler
(
self
,
hidden_states
:
torch
.
Tensor
,
pooling_metadata
:
PoolingMetadata
,
)
->
Optional
[
PoolerOutput
]:
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
,
ignore_unexpected_prefixes
=
[
"lm_head."
])
return
loader
.
load_weights
(
weights
)
vllm/model_executor/models/qwen2_vl.py
View file @
96ae75ad
...
...
@@ -22,28 +22,26 @@
# limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
from
functools
import
cached_property
,
partial
from
typing
import
(
Any
,
Callable
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
Type
,
TypedDict
,
Union
)
from
typing
import
(
Any
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
Type
,
TypedDict
,
Union
)
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
PIL
import
Image
from
transformers
.image_utils
import
(
get_image_size
,
infer_channel_dimension_format
,
to_numpy_array
)
from
transformers
import
BatchFeature
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
Qwen2VLProcessor
)
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
Qwen2VLConfig
,
Qwen2VLVisionConfig
)
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
(
make_batched_images
,
make_batched_videos
,
smart_resize
)
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.inputs
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
QuickGELU
...
...
@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.i
mage
import
cached_get_image_proces
sor
from
vllm.multimodal.
inputs
import
(
MultiModal
Data
,
MultiModalDataDict
,
MultiModal
Kwargs
,
NestedTensors
)
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.i
nputs
import
MultiModalDataDict
,
NestedTen
sor
s
from
vllm.multimodal.
processing
import
(
Base
MultiModal
Processor
,
MultiModal
DataItems
,
ProcessorInputs
,
PromptReplacement
)
from
vllm.platforms
import
_Backend
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.
transformers_utils.processor
import
cached_get_processor
from
vllm.
utils
import
is_list_of
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
get_vit_attn_backend
,
...
...
@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module):
def
__init__
(
self
,
in_features
:
int
,
hidden_features
:
int
=
None
,
hidden_features
:
int
,
act_layer
:
Type
[
nn
.
Module
]
=
QuickGELU
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
...
...
@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module):
# === Vision input helpers === #
def
get_mm_processor_kwargs
(
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
)
->
Dict
[
str
,
int
]:
mm_processor_kwargs
=
{}
if
min_pixels
:
mm_processor_kwargs
[
"min_pixels"
]
=
min_pixels
if
max_pixels
:
mm_processor_kwargs
[
"max_pixels"
]
=
max_pixels
return
mm_processor_kwargs
def
mm_input_mapper_for_qwen2_vl
(
ctx
:
InputContext
,
data
:
MultiModalData
[
object
],
data_type_key
:
str
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
)
->
MultiModalKwargs
:
"""Input mapper for Qwen2-VL."""
if
data_type_key
==
"image"
and
isinstance
(
data
,
dict
):
return
MultiModalKwargs
({
"image_embeds"
:
data
.
get
(
"image_embeds"
),
"image_grid_thw"
:
data
.
get
(
"image_grid_thw"
),
})
if
data_type_key
==
"video"
and
isinstance
(
data
,
dict
):
return
MultiModalKwargs
({
"video_embeds"
:
data
.
get
(
"video_embeds"
),
"video_grid_thw"
:
data
.
get
(
"video_grid_thw"
),
})
model_config
=
ctx
.
model_config
# Handle mm processor kwargs; we pass these at creation time
# because preprocess() in transformers doesn't expose them
mm_processor_kwargs
=
get_mm_processor_kwargs
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
image_processor
=
cached_get_image_processor
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
**
mm_processor_kwargs
,
)
if
image_processor
is
None
:
raise
RuntimeError
(
"No HuggingFace processor is available "
"to process the image object"
)
images
=
None
videos
=
None
if
data_type_key
==
"image"
:
images
=
data
else
:
assert
data_type_key
==
"video"
videos
=
data
try
:
batch_data
=
image_processor
\
.
preprocess
(
images
=
images
,
videos
=
videos
,
return_tensors
=
"pt"
)
\
.
data
except
Exception
:
logger
.
error
(
"Failed to process image (%s)"
,
data
)
raise
return
MultiModalKwargs
(
batch_data
)
image_input_mapper_for_qwen2_vl
=
partial
(
mm_input_mapper_for_qwen2_vl
,
data_type_key
=
"image"
)
video_input_mapper_for_qwen2_vl
=
partial
(
mm_input_mapper_for_qwen2_vl
,
data_type_key
=
"video"
)
def
_get_vision_info
(
image_processor
,
vision_config
:
Qwen2VLVisionConfig
,
height
:
int
,
width
:
int
,
min_pixels
:
int
,
...
...
@@ -775,12 +703,15 @@ def _get_vision_info(
):
"""Get information (resized height / width and number of vision tokens)
of input image / video frame."""
patch_size
=
vision_config
.
patch_size
merge_size
=
vision_config
.
spatial_merge_size
temporal_patch_size
=
vision_config
.
temporal_patch_size
if
do_resize
:
resized_height
,
resized_width
=
smart_resize
(
height
=
height
,
width
=
width
,
factor
=
image_processor
.
patch_size
*
image_processor
.
merge_size
,
factor
=
patch_size
*
merge_size
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
)
...
...
@@ -791,54 +722,41 @@ def _get_vision_info(
grid_t
=
mm_count
else
:
assert
data_type_key
==
"video"
grid_t
=
max
(
mm_count
//
image_processor
.
temporal_patch_size
,
1
)
grid_t
=
max
(
mm_count
//
temporal_patch_size
,
1
)
grid_h
=
resized_height
//
image_processor
.
patch_size
grid_w
=
resized_width
//
image_processor
.
patch_size
grid_h
=
resized_height
//
patch_size
grid_w
=
resized_width
//
patch_size
vision_tokens
=
grid_t
*
grid_h
*
grid_w
llm_num_vision_tokens
=
(
vision_tokens
//
image_processor
.
merge_size
//
image_processor
.
merge_size
)
llm_num_vision_tokens
=
vision_tokens
//
(
merge_size
**
2
)
return
resized_height
,
resized_width
,
llm_num_vision_tokens
def
_get_max_image_info
(
image_processor
,
data_type_key
:
str
=
"image"
,
mm_count
:
int
=
1
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
):
# Limit min / max pixels unless they're explicitly provided
if
min_pixels
is
None
:
min_pixels
=
max
(
image_processor
.
min_pixels
,
28
*
28
)
if
max_pixels
is
None
:
max_pixels
=
min
(
image_processor
.
max_pixels
,
1280
*
28
*
28
)
return
_get_vision_info
(
image_processor
,
height
=
9999999
,
width
=
9999999
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
data_type_key
=
data_type_key
,
mm_count
=
mm_count
,
)
def
_get_image_processor
(
hf_processor
:
Qwen2VLProcessor
):
image_processor
=
hf_processor
.
image_processor
# type: ignore
assert
isinstance
(
image_processor
,
Qwen2VLImageProcessor
)
return
image_processor
def
get_max_qwen2_vl_mm_tokens
(
ctx
:
InputContext
,
data_type_key
:
str
,
*
,
min_pixels
=
None
,
max_pixels
=
None
)
->
int
:
mm_processor_kwargs
=
get_mm_processor_kwargs
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
image_processor
=
cached_get_image_processor
(
ctx
.
model_config
.
model
,
**
mm_processor_kwargs
)
max_resized_height
,
max_resized_width
,
max_llm_image_tokens
=
\
_get_max_image_info
(
image_processor
,
data_type_key
=
data_type_key
,
mm_count
=
1
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
)
->
int
:
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
vision_config
=
hf_config
.
vision_config
hf_processor
=
ctx
.
get_hf_processor
(
Qwen2VLProcessor
)
image_processor
=
_get_image_processor
(
hf_processor
)
_
,
_
,
max_llm_image_tokens
=
_get_vision_info
(
vision_config
,
height
=
9999999
,
width
=
9999999
,
min_pixels
=
min_pixels
or
image_processor
.
min_pixels
,
max_pixels
=
max_pixels
or
image_processor
.
max_pixels
,
data_type_key
=
data_type_key
,
)
return
max_llm_image_tokens
...
...
@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
data_type_key
=
"video"
)
def
dummy_data_for_qwen2_vl
(
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
)
->
Tuple
[
SequenceData
,
Optional
[
MultiModalDataDict
]]:
mm_processor_kwargs
=
get_mm_processor_kwargs
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
image_processor
=
cached_get_image_processor
(
ctx
.
model_config
.
model
,
**
mm_processor_kwargs
)
num_images
=
mm_counts
[
"image"
]
max_resized_height
,
max_resized_width
,
max_llm_image_tokens
=
\
_get_max_image_info
(
image_processor
,
data_type_key
=
"image"
,
mm_count
=
num_images
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
if
seq_len
-
max_llm_image_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-VL cannot process
{
num_images
}
images in a prompt, "
"please increase max_model_len or reduce image limit by "
"--limit-mm-per-prompt."
)
# Check video counts.
num_videos
=
mm_counts
[
"video"
]
max_resized_height
,
max_resized_width
,
max_llm_video_tokens
=
\
_get_max_image_info
(
image_processor
,
data_type_key
=
"video"
,
mm_count
=
num_videos
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
if
seq_len
-
max_llm_video_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-VL cannot process
{
num_videos
}
videos in a prompt, "
"please increase max_model_len or reduce video limit by "
"--limit-mm-per-prompt."
)
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
dummy_seqdata
=
SequenceData
.
from_prompt_token_counts
(
(
hf_config
.
vision_start_token_id
,
1
),
(
hf_config
.
image_token_id
,
max_llm_image_tokens
),
(
hf_config
.
vision_end_token_id
,
1
),
(
0
,
seq_len
-
max_llm_image_tokens
-
2
),
)
dummy_image
=
Image
.
new
(
"RGB"
,
(
max_resized_width
,
max_resized_height
),
color
=
0
)
return
DummyData
(
dummy_seqdata
,
{
"image"
:
dummy_image
if
num_images
==
1
else
[
dummy_image
]
*
num_images
})
class
Qwen2VLMultiModalDataItems
(
MultiModalDataItems
):
@
staticmethod
def
from_dict
(
data
:
MultiModalDataDict
)
->
"MultiModalDataItems"
:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data
=
Qwen2VLMultiModalDataItems
()
for
k
,
v
in
data
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
(
isinstance
(
v
,
(
dict
,
torch
.
Tensor
))
# type: ignore[assignment]
or
is_list_of
(
v
,
list
))
else
[
v
]
)
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
isinstance
(
v
,
(
dict
,
torch
.
Tensor
,
list
))
else
[
v
]
)
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
def
_get_llm_num_vision_tokens
(
mm_inputs
:
list
,
data_type_key
:
str
,
image_processor
,
min_pixels
:
int
,
max_pixels
:
int
,
):
"""Get number of vision tokens of multimodal inputs.
return
multi_data
This method is derived from `transformers.models.qwen2_vl.
image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
"""
image
=
to_numpy_array
(
mm_inputs
[
0
])
input_data_format
=
infer_channel_dimension_format
(
image
)
height
,
width
=
get_image_size
(
image
,
channel_dim
=
input_data_format
)
_
,
_
,
llm_num_vision_tokens
=
_get_vision_info
(
image_processor
,
height
=
height
,
width
=
width
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
do_resize
=
image_processor
.
do_resize
,
data_type_key
=
data_type_key
,
mm_count
=
len
(
mm_inputs
),
)
return
llm_num_vision_tokens
def
get_item_counts
(
self
)
->
Mapping
[
str
,
int
]:
return
{
m
:
(
len
(
items
[
f
"
{
m
}
_grid_thw"
])
# type: ignore
if
isinstance
(
items
,
dict
)
else
len
(
items
))
for
m
,
items
in
self
.
items
()
}
def
_expand_pad_tokens
(
inputs
:
list
,
token_id
:
int
,
make_batched_fn
:
Callable
,
data_type_key
:
str
,
image_processor
:
Any
,
prompt_token_ids
:
List
[
int
],
min_pixels
:
Optional
[
int
],
max_pixels
:
Optional
[
int
])
->
List
[
int
]:
"""
Expand pad tokens for multi-modal inputs (e.g., images or videos).
Args:
inputs (list): The multi-modal inputs (e.g., images or videos).
token_id (int): The token ID used to represent the multi-modal input.
make_batched_fn (Callable): A function to batch the inputs.
data_type_key (str): The type of the multi-modal input.
image_processor (Any): The image processor used to process the inputs.
prompt_token_ids (List[int]): The list of token IDs in the prompt.
min_pixels (int): min pixels to used for img processing
max_pixels (int): max pixels to be used for img processing
Returns:
List[int]: The list of token IDs for the multi-modal inputs.
"""
indices
=
[
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
if
token
==
token_id
]
inputs
=
make_batched_fn
(
inputs
)
assert
len
(
indices
)
==
len
(
inputs
)
prompt_token_ids_with_data
=
[]
for
cnt
,
data
in
enumerate
(
inputs
):
num_tokens
=
_get_llm_num_vision_tokens
(
[
data
]
if
data_type_key
==
"image"
else
data
,
data_type_key
=
data_type_key
,
image_processor
=
image_processor
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
)
if
cnt
==
0
:
end_idx
=
indices
[
cnt
]
non_data_tokens
=
prompt_token_ids
[:
end_idx
]
else
:
non_data_tokens
=
prompt_token_ids
[
indices
[
cnt
-
1
]
+
1
:
indices
[
cnt
]]
prompt_token_ids_with_data
.
extend
(
non_data_tokens
)
prompt_token_ids_with_data
.
extend
(
token_id
for
_
in
range
(
num_tokens
))
prompt_token_ids_with_data
.
extend
(
prompt_token_ids
[
indices
[
-
1
]
+
1
:])
return
prompt_token_ids_with_data
def
input_processor_for_qwen2_vl
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
)
->
DecoderOnlyInputs
:
multi_modal_data
=
inputs
.
get
(
"multi_modal_data"
)
if
multi_modal_data
is
None
:
return
inputs
image_inputs
=
multi_modal_data
.
get
(
"image"
,
None
)
video_inputs
=
multi_modal_data
.
get
(
"video"
,
None
)
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
image_processor
=
processor
.
image_processor
# Apply processor kwarg overrides for image processor options
min_pixels
=
min_pixels
if
min_pixels
else
image_processor
.
min_pixels
max_pixels
=
max_pixels
if
max_pixels
else
image_processor
.
max_pixels
model_config
=
ctx
.
model_config
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
class
Qwen2VLMultiModalProcessor
(
BaseMultiModalProcessor
):
# To avoid redundant processing of vision objects (resize, rescale, etc.),
# we extract code of calculating number of vision tokens from
# `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
#
# The following code is equivalent to:
# prompt = inputs["prompt"]
# inputs = processor(text=[prompt],
# images=image_inputs,
# videos=video_inputs,
# padding=True,
# return_tensors="pt")
# prompt_token_ids = inputs["input_ids"][0].tolist()
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
)
prompt_token_ids
=
inputs
[
"prompt_token_ids"
]
# Expand image pad tokens.
if
image_inputs
is
not
None
:
if
isinstance
(
image_inputs
,
dict
):
prompt_token_ids_with_image
=
[]
image_indices
=
[
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
if
token
==
hf_config
.
image_token_id
]
def
_get_mm_items
(
self
,
mm_data
:
MultiModalDataDict
,
)
->
MultiModalDataItems
:
return
Qwen2VLMultiModalDataItems
.
from_dict
(
mm_data
)
# ensure all image tokens have grid_thw
assert
\
len
(
image_indices
)
==
image_inputs
[
"image_grid_thw"
].
size
(
0
),
\
"image token num does not match image_grid_thw.shape"
image_counter
=
0
pad_token_counter
=
0
for
idx
,
token
in
enumerate
(
prompt_token_ids
):
if
idx
in
image_indices
:
grid_thw
=
image_inputs
[
"image_grid_thw"
][
image_counter
]
grid_t
,
grid_h
,
grid_w
=
grid_thw
num_pad_tokens
=
(
grid_t
*
grid_h
*
grid_w
//
image_processor
.
merge_size
//
image_processor
.
merge_size
)
prompt_token_ids_with_image
.
extend
([
token
]
*
num_pad_tokens
)
image_counter
+=
1
pad_token_counter
+=
num_pad_tokens
def
_get_hf_processor
(
self
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
)
->
Qwen2VLProcessor
:
hf_processor
=
self
.
ctx
.
get_hf_processor
(
Qwen2VLProcessor
)
image_processor
=
_get_image_processor
(
hf_processor
)
if
min_pixels
:
image_processor
.
min_pixels
=
min_pixels
if
max_pixels
:
image_processor
.
max_pixels
=
max_pixels
if
max_pixels
or
min_pixels
:
image_processor
.
size
=
{
"min_pixels"
:
image_processor
.
min_pixels
,
"max_pixels"
:
image_processor
.
max_pixels
,
}
return
hf_processor
def
_get_processor_data
(
self
,
mm_items
:
MultiModalDataItems
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
processor_data
=
dict
[
str
,
Any
]()
passthrough_data
=
dict
[
str
,
Any
]()
for
k
,
v
in
mm_items
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
if
k
in
(
"image"
,
"video"
,
"audio"
):
if
isinstance
(
v
,
dict
):
# Pass through embedding inputs (dict)
passthrough_data
.
update
(
v
)
elif
isinstance
(
v
,
torch
.
Tensor
)
and
v
.
ndim
==
3
:
# Pass through embedding inputs (single)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
[
v
]
elif
(
is_list_of
(
v
,
torch
.
Tensor
)
and
len
(
v
)
>
0
and
v
[
0
].
ndim
==
2
):
# Pass through embedding inputs (multi)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
v
else
:
prompt_token_ids_with_image
.
append
(
token
)
# Map keys to plural form, e.g.: image -> images
processor_data
[
f
"
{
k
}
s"
]
=
v
else
:
processor_data
[
k
]
=
v
# ensure all embeddings are used
assert
\
pad_token_counter
==
image_inputs
[
"image_embeds"
].
size
(
0
),
\
"image_embeds.shape does not match image_grid_thw"
return
processor_data
,
passthrough_data
prompt_token_ids
=
prompt_token_ids_with_image
else
:
prompt_token_ids
=
_expand_pad_tokens
(
image_inputs
,
hf_config
.
image_token_id
,
make_batched_images
,
"image"
,
image_processor
,
prompt_token_ids
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
if
video_inputs
is
not
None
:
if
isinstance
(
video_inputs
,
dict
):
prompt_token_ids_with_video
=
[]
video_indices
=
[
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
if
token
==
hf_config
.
video_token_id
]
def
_get_prompt_replacements
(
self
,
mm_items
:
MultiModalDataItems
,
hf_inputs
:
BatchFeature
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
list
[
PromptReplacement
]:
hf_processor
=
self
.
_get_hf_processor
()
image_processor
=
_get_image_processor
(
hf_processor
)
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
placeholder
=
{
"image"
:
hf_processor
.
image_token
,
"video"
:
hf_processor
.
video_token
,
}
merge_length
=
image_processor
.
merge_size
**
2
def
get_replacement_qwen2vl
(
item_idx
:
int
,
modality
:
str
):
grid_thw
=
hf_inputs
[
f
"
{
modality
}
_grid_thw"
][
item_idx
]
num_tokens
=
grid_thw
.
prod
()
//
merge_length
return
placeholder
[
modality
]
*
num_tokens
return
[
PromptReplacement
(
modality
=
modality
,
target
=
placeholder
[
modality
],
replacement
=
partial
(
get_replacement_qwen2vl
,
modality
=
modality
),
)
for
modality
in
(
"image"
,
"video"
)
]
# ensure all video tokens have grid_thw
assert
\
len
(
video_indices
)
==
video_inputs
[
"video_grid_thw"
].
size
(
0
),
\
"video token num does not match video_grid_thw.shape"
video_counter
=
0
pad_token_counter
=
0
for
idx
,
token
in
enumerate
(
prompt_token_ids
):
if
idx
in
video_indices
:
grid_thw
=
video_inputs
[
"video_grid_thw"
][
video_counter
]
grid_t
,
grid_h
,
grid_w
=
grid_thw
num_pad_tokens
=
(
grid_t
*
grid_h
*
grid_w
//
image_processor
.
merge_size
//
image_processor
.
merge_size
)
prompt_token_ids_with_video
.
extend
([
token
]
*
num_pad_tokens
)
video_counter
+=
1
pad_token_counter
+=
num_pad_tokens
else
:
prompt_token_ids_with_video
.
append
(
token
)
def
_get_dummy_mm_inputs
(
self
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
ProcessorInputs
:
num_images
=
mm_counts
[
"image"
]
hf_processor
=
self
.
_get_hf_processor
()
image_token
:
str
=
hf_processor
.
image_token
image_processor
=
_get_image_processor
(
hf_processor
)
data
=
{}
resized_height
,
resized_width
=
smart_resize
(
height
=
9999999
,
width
=
9999999
,
factor
=
image_processor
.
patch_size
*
image_processor
.
merge_size
,
min_pixels
=
image_processor
.
min_pixels
,
max_pixels
=
image_processor
.
max_pixels
,
)
# ensure all embeddings are used
assert
\
pad_token_counter
==
video_inputs
[
"video_embeds"
].
size
(
0
),
\
"video_embeds.shape does not match video_grid_thw"
dummy_image
=
Image
.
new
(
"RGB"
,
(
resized_width
,
resized_height
),
color
=
0
)
data
[
"image"
]
=
[
dummy_image
]
*
num_images
prompt_token_ids
=
prompt_token_ids_with_video
else
:
prompt_token_ids
=
_expand_pad_tokens
(
video_inputs
,
hf_config
.
video_token_id
,
make_batched_videos
,
"video"
,
image_processor
,
prompt_token_ids
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
prompt
=
inputs
.
get
(
"prompt"
)
if
prompt
is
None
:
prompt
=
tokenizer
.
decode
(
prompt_token_ids
)
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
prompt
=
prompt
,
multi_modal_data
=
multi_modal_data
,
)
return
ProcessorInputs
(
prompt_text
=
image_token
*
num_images
,
mm_data
=
data
,
mm_processor_kwargs
=
{},
)
@
MULTIMODAL_REGISTRY
.
register_image_input_mapper
(
image_input_mapper_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_input_mapper
(
"video"
,
video_input_mapper_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_max_image_tokens
(
get_max_qwen2_vl_image_tokens
)
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
"video"
,
get_max_qwen2_vl_video_tokens
)
@
INPUT_REGISTRY
.
register_dummy_data
(
dummy_data_for_qwen2_vl
)
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Qwen2VLMultiModalProcessor
)
class
Qwen2VLForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
...
...
@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
:
Qwen2VLConfig
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
...
...
@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/models/registry.py
View file @
96ae75ad
...
...
@@ -20,11 +20,10 @@ import torch.nn as nn
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
.adapters
import
as_embedding_model
from
.interfaces
import
(
has_inner_state
,
is_attention_free
,
is_hybrid
,
supports_cross_encoding
,
supports_multimodal
,
supports_pp
)
from
.interfaces_base
import
is_pooling_model
,
is_text_generation_model
from
.interfaces_base
import
is_text_generation_model
logger
=
init_logger
(
__name__
)
...
...
@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = {
"DeciLMForCausalLM"
:
(
"decilm"
,
"DeciLMForCausalLM"
),
"DeepseekForCausalLM"
:
(
"deepseek"
,
"DeepseekForCausalLM"
),
"DeepseekV2ForCausalLM"
:
(
"deepseek_v2"
,
"DeepseekV2ForCausalLM"
),
"DeepseekV3ForCausalLM"
:
(
"deepseek_v3"
,
"DeepseekV3ForCausalLM"
),
"ExaoneForCausalLM"
:
(
"exaone"
,
"ExaoneForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"GemmaForCausalLM"
:
(
"gemma"
,
"GemmaForCausalLM"
),
...
...
@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
"Gemma2Model"
:
(
"gemma2"
,
"Gemma2ForCausalLM"
),
"GlmForCausalLM"
:
(
"glm"
,
"GlmForCausalLM"
),
"GritLM"
:
(
"gritlm"
,
"GritLM"
),
"JambaForSequenceClassification"
:
(
"jamba"
,
"JambaForSequenceClassification"
),
# noqa: E501
"LlamaModel"
:
(
"llama"
,
"LlamaForCausalLM"
),
**
{
# Multiple models share the same architecture, so we include them all
...
...
@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = {
"Qwen2Model"
:
(
"qwen2"
,
"Qwen2EmbeddingModel"
),
"Qwen2ForCausalLM"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
"Qwen2ForRewardModel"
:
(
"qwen2_rm"
,
"Qwen2ForRewardModel"
),
"Qwen2ForSequenceClassification"
:
(
"qwen2_cls"
,
"Qwen2ForSequenceClassification"
),
# noqa: E501
"TeleChat2ForCausalLM"
:
(
"telechat2"
,
"TeleChat2ForCausalLM"
),
# [Multimodal]
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# noqa: E501
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
# [Auto-converted (see adapters.py)]
"Qwen2ForSequenceClassification"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
}
_CROSS_ENCODER_MODELS
=
{
...
...
@@ -225,19 +227,10 @@ class _ModelInfo:
@
staticmethod
def
from_model_cls
(
model
:
Type
[
nn
.
Module
])
->
"_ModelInfo"
:
is_pooling_model_
=
is_pooling_model
(
model
)
if
not
is_pooling_model_
:
try
:
as_embedding_model
(
model
)
except
Exception
:
pass
else
:
is_pooling_model_
=
True
return
_ModelInfo
(
architecture
=
model
.
__name__
,
is_text_generation_model
=
is_text_generation_model
(
model
),
is_pooling_model
=
is_
pooling
_
model
_
,
is_pooling_model
=
True
,
# Can convert any model into a
pooling
model
supports_cross_encoding
=
supports_cross_encoding
(
model
),
supports_multimodal
=
supports_multimodal
(
model
),
supports_pp
=
supports_pp
(
model
),
...
...
vllm/model_executor/models/telechat2.py
View file @
96ae75ad
...
...
@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
class
TeleChat2Model
(
LlamaModel
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"transformer."
:
"model."
,
},
orig_to_new_substr
=
{
".h."
:
".layers."
,
".self_attention."
:
".self_attn."
,
".word_embeddings."
:
".embed_tokens."
,
".dense."
:
".o_proj."
,
".ln_f."
:
".norm."
,
},
)
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
# 1. Initialize the LlamaModel with bias
vllm_config
.
model_config
.
hf_config
.
bias
=
True
...
...
@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"transformer."
:
"model."
,
},
orig_to_new_substr
=
{
".h."
:
".layers."
,
".self_attention."
:
".self_attn."
,
".word_embeddings."
:
".embed_tokens."
,
".dense."
:
".o_proj."
,
".ln_f."
:
".norm."
,
},
)
loader
=
AutoWeightsLoader
(
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/models/ultravox.py
View file @
96ae75ad
...
...
@@ -3,7 +3,7 @@
import
math
from
functools
import
cached_property
,
lru_cache
from
typing
import
(
Any
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
from
typing
import
(
Any
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
import
numpy
as
np
...
...
@@ -11,7 +11,7 @@ import torch
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
functional
as
F
from
transformers
import
BatchFeature
from
transformers
import
BatchFeature
,
ProcessorMixin
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
transformers.models.whisper.modeling_whisper
import
WhisperEncoder
...
...
@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
NestedTensors
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
MultiModalDataDict
,
MultiModalDataItems
,
ProcessorInputs
,
PromptReplacement
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
...
...
@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
def
whisper_feature_extractor
(
ctx
:
InputContext
)
->
WhisperFeatureExtractor
:
return
cached_feature_extractor
(
ctx
.
get_hf_config
(
UltravoxC
onfig
)
.
audio_model_id
)
hf_config
=
ctx
.
get_hf_config
(
UltravoxConfig
)
return
cached_feature_extractor
(
hf_c
onfig
.
audio_model_id
)
def
get_ultravox_max_audio_tokens
(
ctx
:
InputContext
):
...
...
@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
class
UltravoxMultiModalProcessor
(
BaseMultiModalProcessor
):
def
_get_feature_extractor
(
self
)
->
WhisperFeatureExtractor
:
return
self
.
_get_hf_processor
().
audio_processor
.
feature_extractor
hf_processor
=
self
.
_get_hf_processor
()
return
hf_processor
.
audio_processor
.
feature_extractor
# type: ignore
def
_
resample_audio
(
def
_
get_processor_data
(
self
,
audio
:
np
.
ndarray
,
sr
:
int
,
)
->
Dict
[
str
,
Union
[
np
.
ndarray
,
int
]]:
mm_items
:
MultiModalDataItems
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
# resample audio to the model's sampling rate
feature_extractor
=
self
.
_get_feature_extractor
()
if
sr
!=
feature_extractor
.
sampling_rate
:
try
:
import
librosa
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[audio] for audio support."
)
from
exc
audio
=
librosa
.
resample
(
audio
,
orig_sr
=
sr
,
target_sr
=
feature_extractor
.
sampling_rate
)
sr
=
feature_extractor
.
sampling_rate
return
{
"audio"
:
audio
,
"sampling_rate"
:
sr
}
def
_apply_hf_processor
(
mm_items
.
resample_audios
(
feature_extractor
.
sampling_rate
)
return
super
().
_get_processor_data
(
mm_items
)
def
_call_hf_processor
(
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
mm_data
:
MultiModalDataDi
ct
,
processor_data
:
Mapping
[
str
,
obje
ct
]
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
if
not
mm_data
or
not
mm_data
.
get
(
"audio"
,
None
):
return
super
().
_apply_hf_processor
(
prompt
,
mm_data
,
mm_processor_kwargs
)
processor_data
=
dict
(
processor_data
)
audios
=
processor_data
.
pop
(
"audios"
,
[])
if
not
audios
:
return
super
().
_call_hf_processor
(
hf_processor
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
feature_extractor
=
self
.
_get_feature_extractor
()
mm_processor_kwargs
=
dict
(
**
mm_processor_kwargs
,
sampling_rate
=
feature_extractor
.
sampling_rate
,
)
audio_data
=
mm_data
[
"audio"
]
if
not
isinstance
(
audio_data
,
list
):
audio_data
=
[
audio_data
]
# Already resampled by _get_processor_data
assert
is_list_of
(
audios
,
np
.
ndarray
)
# Ultravox processor doesn't support multiple inputs,
# therefore we need to input text and audio one by one
tokenizer
=
self
.
_get_tokenizer
()
audio_features
,
audio_token_len
=
[],
[]
processed_inputs
=
{}
for
audio
,
sr
in
audio_data
:
data
=
self
.
_resample_audio
(
audio
,
sr
)
processed_inputs
=
super
().
_apply_hf_processor
(
prompt
,
data
,
mm_processor_kwargs
)
prompt
=
tokenizer
.
decode
(
processed_inputs
[
"input_ids"
][
0
],
skip_special_tokens
=
False
)
audio_features
.
append
(
processed_inputs
.
pop
(
"audio_values"
).
squeeze
(
0
))
audio_token_len
.
append
(
processed_inputs
.
pop
(
"audio_token_len"
).
item
())
return
dict
(
**
processed_inputs
,
shared_outputs
=
{}
for
audio
in
audios
:
# NOTE: Ultravox processor accepts "audio" instead of "audios"
item_processor_data
=
dict
(
**
processor_data
,
audio
=
audio
)
item_outputs
=
super
().
_call_hf_processor
(
hf_processor
,
prompt
=
prompt
,
processor_data
=
item_processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
audio_features
.
append
(
item_outputs
.
pop
(
"audio_values"
)[
0
])
audio_token_len
.
append
(
item_outputs
.
pop
(
"audio_token_len"
).
item
())
shared_outputs
=
item_outputs
combined_outputs
=
dict
(
**
shared_outputs
,
audio_features
=
audio_features
,
audio_token_len
=
audio_token_len
,
)
def
_get_processor_data
(
self
,
mm_data
:
MultiModalDataDict
,
)
->
Tuple
[
Dict
[
str
,
Any
],
Dict
[
str
,
Any
]]:
# Ultravox uses "audio" instead of "audios" as calling keyword
processor_data
,
passthrough_data
=
super
().
_get_processor_data
(
mm_data
)
if
"audios"
in
processor_data
:
processor_data
[
"audio"
]
=
processor_data
.
pop
(
"audios"
)
return
processor_data
,
passthrough_data
return
BatchFeature
(
combined_outputs
)
def
_get_prompt_replacements
(
self
,
...
...
@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
list
[
PromptReplacement
]:
hf_processor
=
self
.
_get_hf_processor
()
placeholder
=
hf_processor
.
audio_token_replacement
placeholder
=
hf_processor
.
audio_token_replacement
# type: ignore
def
get_replacement_ultravox
(
item_idx
:
int
):
audio_token_len
=
hf_inputs
[
"audio_token_len"
][
item_idx
]
...
...
@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
audio_count
=
mm_counts
[
"audio"
]
audio
=
np
.
zeros
(
audio_len
)
data
=
{
"audio"
:
[
(
audio
,
sampling_rate
)
]
*
audio_count
}
data
=
{
"audio"
:
[
audio
]
*
audio_count
}
return
ProcessorInputs
(
prompt_text
=
"<|audio|>"
*
audio_count
,
...
...
@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
@
MULTIMODAL_REGISTRY
.
register_processor
(
UltravoxMultiModalProcessor
)
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
loader
=
AutoWeightsLoader
(
self
,
ignore_unexpected_prefixes
=
[
"audio_tower."
])
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
\ No newline at end of file
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/parameter.py
View file @
96ae75ad
...
...
@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
marlin_tile_size
=
self
.
marlin_tile_size
)
class
BlockQuantScaleParameter
(
_ColumnvLLMParameter
,
RowvLLMParameter
):
"""
Parameter class for weight scales loaded for weights with
block-wise quantization. Uses both column and row parallelism.
"""
pass
def
permute_param_layout_
(
param
:
BasevLLMParameter
,
input_dim
:
int
,
output_dim
:
int
,
**
kwargs
)
->
BasevLLMParameter
:
"""
...
...
vllm/multimodal/__init__.py
View file @
96ae75ad
...
...
@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to
dispatch data processing according to its modality and the target model.
See also:
:ref:`input
_
processing
_
pipeline`
:ref:`input
-
processing
-
pipeline`
"""
__all__
=
[
...
...
vllm/multimodal/audio.py
View file @
96ae75ad
import
numpy
as
np
import
numpy.typing
as
npt
from
vllm.inputs.registry
import
InputContext
from
vllm.utils
import
PlaceholderModule
from
.base
import
MultiModalPlugin
from
.inputs
import
AudioItem
,
MultiModalData
,
MultiModalKwargs
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
class
AudioPlugin
(
MultiModalPlugin
):
"""Plugin for audio data."""
...
...
@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
raise
NotImplementedError
(
"There is no default maximum multimodal tokens"
)
def
resample_audio
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
librosa
.
resample
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
vllm/multimodal/base.py
View file @
96ae75ad
...
...
@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
(i.e., the modality of the data).
See also:
:ref:`adding
_
multimodal
_
plugin`
:ref:`adding
-
multimodal
-
plugin`
"""
def
__init__
(
self
)
->
None
:
...
...
@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default input mapper is used instead.
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
-
multimodal
-
inputs`
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
...
...
@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC):
TypeError: If the data type is not supported.
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
-
multimodal
-
inputs`
"""
# Avoid circular import
...
...
@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default calculation is used instead.
See also:
:ref:`enabling
_
multimodal
_
inputs`
:ref:`enabling
-
multimodal
-
inputs`
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
...
...
@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC):
The model is identified by ``model_config``.
See also:
:ref:`enabling
_
multimodal
_
inputs`
:ref:`enabling
-
multimodal
-
inputs`
"""
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
...
...
vllm/multimodal/image.py
View file @
96ae75ad
...
...
@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
3000
def
rescale_image_size
(
image
:
Image
.
Image
,
size_factor
:
float
,
transpose
:
int
=
-
1
)
->
Image
.
Image
:
"""Rescale the dimensions of an image by a constant factor."""
new_width
=
int
(
image
.
width
*
size_factor
)
new_height
=
int
(
image
.
height
*
size_factor
)
image
=
image
.
resize
((
new_width
,
new_height
))
if
transpose
>=
0
:
image
=
image
.
transpose
(
Image
.
Transpose
(
transpose
))
return
image
vllm/multimodal/inputs.py
View file @
96ae75ad
...
...
@@ -15,31 +15,32 @@ _T = TypeVar("_T")
# yapf: disable
ImageItem
:
TypeAlias
=
Union
[
Image
,
np
.
ndarray
,
torch
.
Tensor
]
"""
A :class:`transformers.image_utils.ImageInput` representing a single image
,
which can be passed to a HuggingFace :code:`ImageProcessor`.
A :class:`transformers.image_utils.ImageInput` representing a single image
item,
which can be passed to a HuggingFace :code:`ImageProcessor`.
"""
VideoItem
:
TypeAlias
=
Union
[
L
ist
[
Image
],
l
ist
[
Image
],
np
.
ndarray
,
torch
.
Tensor
,
L
ist
[
np
.
ndarray
],
L
ist
[
torch
.
Tensor
],
l
ist
[
np
.
ndarray
],
l
ist
[
torch
.
Tensor
],
]
"""
A :class:`transformers.image_utils.VideoInput` representing a single video,
which can be passed to a HuggingFace :code:`VideoProcessor`.
A :class:`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
"""
AudioItem
:
TypeAlias
=
Union
[
np
.
ndarray
,
List
[
float
],
Tuple
[
np
.
ndarray
,
float
],
# DEPRECATED: Use mm_processor_kwargs instead
list
[
float
],
# `(audio, sampling_rate)`: If the audio's sampling rate is different
# from that expected by the model, we need to resample it.
tuple
[
np
.
ndarray
,
float
],
]
"""
Represents a single audio
that can be inputted to a HuggingFace
:code:`AudioProcessor`.
Represents a single audio
item, which can be passed to a HuggingFace
:code:`AudioProcessor`.
"""
# yapf: enable
...
...
@@ -74,7 +75,7 @@ Note:
This dictionary also accepts modality keys defined outside
:class:`MultiModalDataBuiltins` as long as a customized plugin
is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
Read more on that :ref:`here <adding
_
multimodal
_
plugin>`.
Read more on that :ref:`here <adding
-
multimodal
-
plugin>`.
"""
...
...
@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
mm_kwargs
:
MultiModalKwargs
"""Keyword arguments to be directly passed to the model after batching."""
mm_hashes
:
NotRequired
[
List
[
str
]]
"""The hashes of the multi-modal data."""
mm_placeholders
:
MultiModalPlaceholderDict
"""
For each modality, information about the placeholder tokens in
...
...
vllm/multimodal/processing.py
View file @
96ae75ad
...
...
@@ -17,6 +17,7 @@ from vllm.logger import init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
MistralTokenizer
from
vllm.utils
import
flatten_2d_lists
,
full_groupby
,
is_list_of
from
.audio
import
resample_audio
from
.inputs
import
(
AudioItem
,
ImageItem
,
MultiModalDataDict
,
MultiModalInputsV2
,
MultiModalKwargs
,
PlaceholderRange
,
VideoItem
)
...
...
@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]]
@
dataclass
class
PromptReplacement
:
modality
:
str
"""The modality for which the replacement is made"""
"""The modality for which the replacement is made
.
"""
target
:
_PromptSeq
"""The text or token sequence to find and replace."""
...
...
@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
corresponds to a list.
"""
@
staticmethod
def
from_dict
(
data
:
MultiModalDataDict
)
->
"MultiModalDataItems"
:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data
=
MultiModalDataItems
()
for
k
,
v
in
data
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
(
isinstance
(
v
,
torch
.
Tensor
)
or
is_list_of
(
v
,
list
))
else
[
v
]
)
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
isinstance
(
v
,
(
torch
.
Tensor
,
list
))
else
[
v
]
)
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
return
multi_data
# NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
# `self.images` doesn't update this dictionary, which may be confusing
# We annotate the getter methods as `Sequence` to prevent others from
# trying to update the list in this way
@
property
def
image
(
self
)
->
list
[
ImageItem
]:
return
self
[
"image"
]
def
image
s
(
self
)
->
Sequence
[
ImageItem
]:
return
self
.
get
(
"image"
,
[])
@
property
def
video
(
self
)
->
list
[
VideoItem
]:
return
self
[
"video"
]
def
video
s
(
self
)
->
Sequence
[
VideoItem
]:
return
self
.
get
(
"video"
,
[])
@
property
def
audio
(
self
)
->
list
[
AudioItem
]:
return
self
[
"audio"
]
def
audios
(
self
)
->
Sequence
[
AudioItem
]:
return
self
.
get
(
"audio"
,
[])
def
get_item_counts
(
self
)
->
Mapping
[
str
,
int
]:
return
{
m
:
len
(
items
)
for
m
,
items
in
self
.
items
()}
def
get_image_size
(
self
,
item_idx
:
int
)
->
ImageSize
:
image
=
self
.
image
[
item_idx
]
image
=
self
.
image
s
[
item_idx
]
if
isinstance
(
image
,
Image
):
return
ImageSize
(
*
image
.
size
)
...
...
@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
assert_never
(
image
)
def
get_audio_with_sr
(
self
,
item_idx
:
int
,
*
,
default_sr
:
float
,
)
->
tuple
[
np
.
ndarray
,
float
]:
audio
=
self
.
audios
[
item_idx
]
if
isinstance
(
audio
,
tuple
):
return
audio
if
isinstance
(
audio
,
list
):
return
np
.
array
(
audio
),
default_sr
if
isinstance
(
audio
,
np
.
ndarray
):
return
audio
,
default_sr
assert_never
(
audio
)
def
resample_audios
(
self
,
new_sr
:
float
,
*
,
drop_sr
:
bool
=
True
)
->
None
:
"""
If :code:`drop_sr=True`, the audio items in this dictionary are updated
to be NumPy arrays which implicitly means that their sampling rate is
the same as the model's expected sampling rate; otherwise, they remain
as :code:`(audio, new_sr)` tuples.
"""
if
not
self
.
audios
:
return
def
to_multi_format
(
data
:
MultiModalDataDict
)
->
MultiModalDataItems
:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data
=
MultiModalDataItems
()
for
k
,
v
in
data
.
items
():
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
v
if
is_list_of
(
v
,
list
)
else
[
v
]
# type: ignore[index]
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
new_audios
=
[]
for
item_idx
in
range
(
len
(
self
.
audios
)):
audio
,
sr
=
self
.
get_audio_with_sr
(
item_idx
,
default_sr
=
new_sr
)
audio
=
resample_audio
(
audio
,
orig_sr
=
sr
,
target_sr
=
new_sr
)
new_audios
.
append
(
audio
if
drop_sr
else
(
audio
,
new_sr
))
return
multi_data
self
[
"audio"
]
=
new_audios
class
_TokenMatch
(
NamedTuple
):
...
...
@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
def
_get_tokenizer
(
self
)
->
AnyTokenizer
:
return
self
.
ctx
.
tokenizer
def
_get_mm_items
(
self
,
mm_data
:
MultiModalDataDict
,
)
->
MultiModalDataItems
:
return
MultiModalDataItems
.
from_dict
(
mm_data
)
@
abstractmethod
def
_get_prompt_replacements
(
self
,
...
...
@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC):
def
_get_processor_data
(
self
,
mm_
data
:
MultiModalData
Dict
,
)
->
BatchFeature
:
mm_
items
:
MultiModalData
Items
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]
:
processor_data
=
dict
[
str
,
Any
]()
passthrough_data
=
dict
[
str
,
Any
]()
for
k
,
v
in
mm_data
.
items
():
for
k
,
v
in
mm_items
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
if
k
in
(
"image"
,
"video"
,
"audio"
):
if
isinstance
(
v
,
torch
.
Tensor
)
and
v
.
ndim
==
3
:
# Pass through embedding inputs (single)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
[
v
]
elif
is_list_of
(
v
,
torch
.
Tensor
)
and
v
[
0
].
ndim
==
2
:
elif
(
is_list_of
(
v
,
torch
.
Tensor
)
and
len
(
v
)
>
0
and
v
[
0
].
ndim
==
2
):
# Pass through embedding inputs (multi)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
v
else
:
...
...
@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC):
processor_data
[
f
"
{
k
}
s"
]
=
v
else
:
processor_data
[
k
]
=
v
return
processor_data
,
passthrough_data
def
_call_hf_processor
(
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
processor_data
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
return
self
.
ctx
.
call_hf_processor
(
hf_processor
,
prompt
,
processor_data
,
mm_processor_kwargs
,
)
def
_apply_hf_processor
(
self
,
prompt
:
str
,
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
# some mm_processor_kwargs may be used in processor initialization
# instead of processor call
hf_processor
=
self
.
_get_hf_processor
(
**
mm_processor_kwargs
)
processor_data
,
passthrough_data
=
self
.
_get_processor_data
(
mm_
data
)
processor_data
,
passthrough_data
=
self
.
_get_processor_data
(
mm_
items
)
assert
callable
(
hf_processor
)
mm_processor_kwargs
=
self
.
ctx
.
resolve_hf_processor_call_kwargs
(
hf_inputs
=
self
.
_call_hf_processor
(
hf_processor
,
mm_processor_kwargs
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
try
:
hf_inputs
=
hf_processor
(
text
=
prompt
,
# type: ignore
**
processor_data
,
**
mm_processor_kwargs
,
return_tensors
=
"pt"
,
)
except
Exception
as
exc
:
data
=
dict
(
text
=
prompt
,
**
processor_data
)
raise
RuntimeError
(
f
"Failed to apply
{
type
(
hf_processor
).
__name__
}
"
f
"on data=
{
data
}
with kwargs=
{
mm_processor_kwargs
}
"
)
from
exc
hf_inputs
.
update
(
passthrough_data
)
return
hf_inputs
...
...
@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC):
3. Extract information about the placeholder tokens from the
processed token IDs.
"""
tokenizer
=
self
.
_get_
tokenizer
(
)
mm_items
=
self
.
_get_
mm_items
(
mm_data
)
hf_inputs
=
self
.
_apply_hf_processor
(
prompt_text
,
mm_
data
,
hf_inputs
=
self
.
_apply_hf_processor
(
prompt_text
,
mm_
items
,
mm_processor_kwargs
)
prompt_ids
,
=
hf_inputs
.
pop
(
"input_ids"
).
tolist
()
mm_kwargs
=
MultiModalKwargs
(
hf_inputs
)
mm_items
=
to_multi_format
(
mm_data
)
prompt_repls
=
self
.
_get_prompt_replacements
(
mm_items
,
hf_inputs
,
mm_processor_kwargs
)
all_prompt_repls
=
self
.
_bind_prompt_replacements
(
prompt_repls
)
# If HF processor already inserts placeholder tokens,
# there is no need for us to insert them
mm_item_counts
=
{
m
:
len
(
items
)
for
m
,
items
in
mm_items
.
item
s
()
}
mm_item_counts
=
mm_items
.
get_item_count
s
()
all_placeholders
=
self
.
_find_placeholders
(
all_prompt_repls
,
prompt_ids
,
mm_item_counts
)
if
all_placeholders
:
tokenizer
=
self
.
_get_tokenizer
()
prompt_text
=
_decode
(
tokenizer
,
prompt_ids
)
else
:
(
...
...
vllm/multimodal/registry.py
View file @
96ae75ad
...
...
@@ -76,7 +76,7 @@ class MultiModalRegistry:
Register a multi-modal plugin so it can be recognized by vLLM.
See also:
:ref:`adding
_
multimodal
_
plugin`
:ref:`adding
-
multimodal
-
plugin`
"""
data_type_key
=
plugin
.
get_data_key
()
...
...
@@ -311,8 +311,8 @@ class MultiModalRegistry:
invoked to transform the data into a dictionary of model inputs.
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
-
multimodal
-
inputs`
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
...
...
vllm/multimodal/utils.py
View file @
96ae75ad
...
...
@@ -2,7 +2,7 @@ import base64
import
os
from
functools
import
lru_cache
from
io
import
BytesIO
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
TypeVar
,
Union
from
typing
import
List
,
Optional
,
Tuple
,
TypeVar
,
Union
import
numpy
as
np
import
numpy.typing
as
npt
...
...
@@ -14,9 +14,25 @@ import vllm.envs as envs
from
vllm.connections
import
global_http_connection
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_tokenizer
from
vllm.utils
import
PlaceholderModule
from
.inputs
import
MultiModalDataDict
,
PlaceholderRange
try
:
import
decord
except
ImportError
:
decord
=
PlaceholderModule
(
"decord"
)
# type: ignore[assignment]
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
logger
=
init_logger
(
__name__
)
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
...
...
@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str,
return
image
.
convert
(
image_mode
)
def
_load_video_frames_from_bytes
(
b
:
bytes
):
frame
=
Image
.
open
(
BytesIO
(
b
))
return
np
.
array
(
frame
)
def
load_video_frames_from_base64
(
frame
:
Union
[
bytes
,
str
]):
"""Load frame from base64 format."""
return
_load_video_frames_from_bytes
(
base64
.
b64decode
(
frame
))
def
_load_video_from_bytes
(
b
:
bytes
,
num_frames
:
int
=
32
):
_
,
decord
=
try_import_video_packages
()
def
_load_video_from_bytes
(
b
:
bytes
,
num_frames
:
int
=
32
)
->
npt
.
NDArray
:
video_path
=
BytesIO
(
b
)
vr
=
decord
.
VideoReader
(
video_path
,
num_threads
=
1
)
total_frame_num
=
len
(
vr
)
...
...
@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
return
frames
def
_load_video_from_data_url
(
video_url
:
str
):
# Only split once and assume the second part is the base64 encoded image
frames_base64
=
video_url
.
split
(
","
)[
1
:]
return
np
.
stack
([
load_video_frames_from_base64
(
frame_base64
)
for
frame_base64
in
frames_base64
])
def
_load_video_from_data_url
(
video_url
:
str
)
->
npt
.
NDArray
:
# Only split once and assume the second part is the base64 encoded video
_
,
video_base64
=
video_url
.
split
(
","
,
1
)
if
video_url
.
startswith
(
"data:video/jpeg;"
):
return
np
.
stack
([
np
.
array
(
load_image_from_base64
(
frame_base64
))
for
frame_base64
in
video_base64
.
split
(
","
)
])
return
load_video_from_base64
(
video_base64
)
def
fetch_video
(
video_url
:
str
,
*
,
num_frames
:
int
=
32
)
->
npt
.
NDArray
:
...
...
@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str,
return
video
def
try_import_audio_packages
()
->
Tuple
[
Any
,
Any
]:
try
:
import
librosa
import
soundfile
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[audio] for audio support."
)
from
exc
return
librosa
,
soundfile
def
fetch_audio
(
audio_url
:
str
)
->
Tuple
[
np
.
ndarray
,
Union
[
int
,
float
]]:
"""
Load audio from a URL.
"""
librosa
,
_
=
try_import_audio_packages
()
if
audio_url
.
startswith
(
"http"
):
audio_bytes
=
global_http_connection
.
get_bytes
(
audio_url
,
...
...
@@ -253,8 +249,6 @@ async def async_fetch_audio(
"""
Asynchronously fetch audio from a URL.
"""
librosa
,
_
=
try_import_audio_packages
()
if
audio_url
.
startswith
(
"http"
):
audio_bytes
=
await
global_http_connection
.
async_get_bytes
(
audio_url
,
...
...
@@ -313,8 +307,6 @@ def encode_audio_base64(
sampling_rate
:
int
,
)
->
str
:
"""Encode audio as base64."""
_
,
soundfile
=
try_import_audio_packages
()
buffered
=
BytesIO
()
soundfile
.
write
(
buffered
,
audio
,
sampling_rate
,
format
=
"WAV"
)
...
...
@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
return
_load_image_from_bytes
(
base64
.
b64decode
(
image
))
def
rescale_image_size
(
image
:
Image
.
Image
,
size_factor
:
float
,
transpose
:
int
=
-
1
)
->
Image
.
Image
:
"""Rescale the dimensions of an image by a constant factor."""
new_width
=
int
(
image
.
width
*
size_factor
)
new_height
=
int
(
image
.
height
*
size_factor
)
image
=
image
.
resize
((
new_width
,
new_height
))
if
transpose
>=
0
:
image
=
image
.
transpose
(
Image
.
Transpose
(
transpose
))
return
image
def
try_import_video_packages
()
->
Any
:
try
:
import
cv2
import
decord
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[video] for video support."
)
from
exc
return
cv2
,
decord
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
Tuple
[
int
,
int
])
->
npt
.
NDArray
:
cv2
,
_
=
try_import_video_packages
()
num_frames
,
_
,
_
,
channels
=
frames
.
shape
new_height
,
new_width
=
size
resized_frames
=
np
.
empty
((
num_frames
,
new_height
,
new_width
,
channels
),
dtype
=
frames
.
dtype
)
for
i
,
frame
in
enumerate
(
frames
):
resized_frame
=
cv2
.
resize
(
frame
,
(
new_width
,
new_height
))
resized_frames
[
i
]
=
resized_frame
return
resized_frames
def
rescale_video_size
(
frames
:
npt
.
NDArray
,
size_factor
:
float
)
->
npt
.
NDArray
:
_
,
height
,
width
,
_
=
frames
.
shape
new_height
=
int
(
height
*
size_factor
)
new_width
=
int
(
width
*
size_factor
)
return
resize_video
(
frames
,
(
new_height
,
new_width
))
def
sample_frames_from_video
(
frames
:
npt
.
NDArray
,
num_frames
:
int
)
->
npt
.
NDArray
:
total_frames
=
frames
.
shape
[
0
]
if
num_frames
==
-
1
:
return
frames
else
:
frame_indices
=
np
.
linspace
(
0
,
total_frames
-
1
,
num_frames
,
dtype
=
int
)
sampled_frames
=
frames
[
frame_indices
,
...]
return
sampled_frames
def
encode_video_base64
(
frames
:
npt
.
NDArray
):
def
encode_video_base64
(
frames
:
npt
.
NDArray
)
->
str
:
base64_frames
=
[]
frames_list
=
[
frames
[
i
]
for
i
in
range
(
frames
.
shape
[
0
])]
for
frame
in
frames_list
:
...
...
@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray):
return
","
.
join
(
base64_frames
)
def
load_video_from_base64
(
video
:
Union
[
bytes
,
str
])
->
npt
.
NDArray
:
"""Load video from base64 format."""
return
_load_video_from_bytes
(
base64
.
b64decode
(
video
))
def
resolve_visual_encoder_outputs
(
encoder_outputs
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
feature_sample_layers
:
Optional
[
list
[
int
]],
...
...
vllm/multimodal/video.py
View file @
96ae75ad
from
functools
import
lru_cache
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
Optional
import
cv2
import
numpy
as
np
import
numpy.typing
as
npt
from
vllm.inputs.registry
import
InputContext
from
vllm.logger
import
init_logger
...
...
@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
4096
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
tuple
[
int
,
int
])
->
npt
.
NDArray
:
num_frames
,
_
,
_
,
channels
=
frames
.
shape
new_height
,
new_width
=
size
resized_frames
=
np
.
empty
((
num_frames
,
new_height
,
new_width
,
channels
),
dtype
=
frames
.
dtype
)
for
i
,
frame
in
enumerate
(
frames
):
resized_frame
=
cv2
.
resize
(
frame
,
(
new_width
,
new_height
))
resized_frames
[
i
]
=
resized_frame
return
resized_frames
def
rescale_video_size
(
frames
:
npt
.
NDArray
,
size_factor
:
float
)
->
npt
.
NDArray
:
_
,
height
,
width
,
_
=
frames
.
shape
new_height
=
int
(
height
*
size_factor
)
new_width
=
int
(
width
*
size_factor
)
return
resize_video
(
frames
,
(
new_height
,
new_width
))
def
sample_frames_from_video
(
frames
:
npt
.
NDArray
,
num_frames
:
int
)
->
npt
.
NDArray
:
total_frames
=
frames
.
shape
[
0
]
if
num_frames
==
-
1
:
return
frames
frame_indices
=
np
.
linspace
(
0
,
total_frames
-
1
,
num_frames
,
dtype
=
int
)
sampled_frames
=
frames
[
frame_indices
,
...]
return
sampled_frames
vllm/outputs.py
View file @
96ae75ad
...
...
@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]):
pooled_data
=
seq_group
.
pooled_data
assert
pooled_data
is
not
None
output
=
PoolingOutput
(
pooled_data
)
data
=
pooled_data
.
to
(
dtype
=
torch
.
float32
,
device
=
"cpu"
)
output
=
PoolingOutput
(
data
)
prompt_token_ids
=
seq_group
.
prompt_token_ids
finished
=
seq_group
.
is_finished
()
...
...
vllm/platforms/cpu.py
View file @
96ae75ad
...
...
@@ -54,7 +54,7 @@ class CpuPlatform(Platform):
import
vllm.envs
as
envs
from
vllm.utils
import
GiB_bytes
model_config
=
vllm_config
.
model_config
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
not
model_config
.
enforce_eager
:
logger
.
warning
(
...
...
vllm/scripts.py
View file @
96ae75ad
...
...
@@ -165,7 +165,7 @@ def main():
required
=
False
,
help
=
"Read CLI options from a config file."
"Must be a YAML with the following options:"
"https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#c
ommand-line-arguments-for-the-server
"
"https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#c
li-reference
"
)
serve_parser
=
make_arg_parser
(
serve_parser
)
serve_parser
.
set_defaults
(
dispatch_function
=
serve
)
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
96ae75ad
...
...
@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
return
spec_decode_worker
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
class
SpecDecodeWorker
(
LoraNotSupportedWorkerBase
):
"""Worker which implements speculative decoding.
...
...
Prev
1
…
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment