Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
639 additions
and
951 deletions
+639
-951
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+115
-190
vllm/model_executor/models/qwen2_cls.py
vllm/model_executor/models/qwen2_cls.py
+0
-104
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+192
-398
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+6
-13
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/telechat2.py
+14
-13
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+60
-60
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+9
-0
vllm/multimodal/__init__.py
vllm/multimodal/__init__.py
+1
-1
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+18
-0
vllm/multimodal/base.py
vllm/multimodal/base.py
+7
-7
vllm/multimodal/image.py
vllm/multimodal/image.py
+12
-0
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+17
-13
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+113
-53
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+3
-3
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+35
-92
vllm/multimodal/video.py
vllm/multimodal/video.py
+32
-0
vllm/outputs.py
vllm/outputs.py
+2
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+1
-1
vllm/scripts.py
vllm/scripts.py
+1
-1
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+1
-1
No files found.
vllm/model_executor/models/qwen2_audio.py
View file @
96ae75ad
...
@@ -19,45 +19,43 @@
...
@@ -19,45 +19,43 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
from
functools
import
cached_property
,
lru_cache
from
functools
import
cached_property
from
typing
import
(
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
from
typing
import
(
Any
,
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
Union
)
TypedDict
,
Union
)
import
librosa
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
transformers
import
Qwen2AudioEncoder
from
transformers
import
BatchFeature
,
ProcessorMixin
from
transformers.models.qwen2_audio
import
(
Qwen2AudioConfig
,
Qwen2AudioEncoder
,
Qwen2AudioProcessor
)
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.attention
import
AttentionMetadata
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
from
vllm.inputs
import
InputContext
InputContext
,
token_inputs
)
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
MultiModalDataItems
,
ProcessorInputs
,
PromptReplacement
)
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
maybe_prefix
,
merge_multimodal_embeddings
)
logger
=
init_logger
(
__name__
)
# # === Audio Inputs === #
# # === Audio Inputs === #
class
Qwen2AudioInputs
(
TypedDict
):
class
Qwen2AudioInputs
(
TypedDict
):
input_features
:
torch
.
Tensor
input_features
:
torch
.
Tensor
"""Shape:
"""Shape: `(num_audios, num_mel_bins, 3000)`"""
`(num_audios, num_mel_bins, 3000)`
"""
feature_attention_mask
:
torch
.
Tensor
feature_attention_mask
:
torch
.
Tensor
"""Shape: `(num_audios, 3000)`
"""Shape: `(num_audios, 3000)`"""
"""
# === Audio Encoder === #
# === Audio Encoder === #
...
@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module):
...
@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module):
return
hidden_states
return
hidden_states
def
dummy_data_for_qwen2_audio
(
ctx
:
InputContext
,
seq_len
:
int
,
# From Qwen2AudioEncoder._get_feat_extract_output_lengths
mm_counts
:
Mapping
[
str
,
int
]):
def
_get_feat_extract_output_lengths
(
input_lengths
:
torch
.
LongTensor
):
num_audios
=
mm_counts
[
"audio"
]
feat_lengths
=
(
input_lengths
-
1
)
//
2
+
1
max_tokens_per_audio
=
get_max_qwen2_audio_audio_tokens
(
ctx
)
output_lengths
=
(
feat_lengths
-
2
)
//
2
+
1
max_llm_audio_tokens
=
max_tokens_per_audio
*
num_audios
return
feat_lengths
,
output_lengths
if
seq_len
-
max_llm_audio_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-Audio cannot process
{
num_audios
}
audios in a prompt, "
"please increase max_model_len or reduce audio limit by "
"--limit-mm-per-prompt."
)
audio_token_index
=
ctx
.
model_config
.
hf_config
.
audio_token_index
dummy_seqdata
=
SequenceData
.
from_prompt_token_counts
(
(
audio_token_index
,
max_llm_audio_tokens
),
(
0
,
seq_len
-
max_llm_audio_tokens
),
)
dummy_audio
=
np
.
full
((
max_llm_audio_tokens
*
2
*
2
*
160
,
),
0.
)
return
DummyData
(
dummy_seqdata
,
{
"audio"
:
[(
dummy_audio
,
16000
)]
*
num_audios
},
{
"audio"
:
consecutive_placeholder_ranges
(
num_items
=
num_audios
,
item_size
=
max_tokens_per_audio
)
})
def get_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Load a HuggingFace processor for ``processor_name``.

    Positional and keyword arguments are forwarded unchanged to
    ``AutoProcessor.from_pretrained``.

    Raises:
        RuntimeError: if loading raises ``ValueError`` while
            ``trust_remote_code`` is false; the original error is chained.
        ValueError: re-raised as-is when ``trust_remote_code`` is true.
    """
    # Keep this import local: at module top level it would call
    # torch.cuda.device_count().
    from transformers import AutoProcessor

    try:
        return AutoProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # Remote code was already allowed, so there is no hint to add.
        if trust_remote_code:
            raise e

        # AutoProcessor (unlike AutoTokenizer) does not distinguish a
        # missing / not-yet-imported processor class from other failures,
        # so always point the user at the trust-remote-code switch.
        err_msg = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e
def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
    """Return ``(max_source_positions - 2) // 2 + 1`` for the audio config.

    ``max_source_positions`` is read from the ``audio_config`` of the
    ``Qwen2AudioConfig`` obtained through ``ctx.get_hf_config``.
    """
    audio_config = ctx.get_hf_config(Qwen2AudioConfig).audio_config
    # NOTE(review): presumably the per-audio token upper bound after the
    # encoder's final downsampling step - confirm against the encoder.
    return (audio_config.max_source_positions - 2) // 2 + 1
# Memoized wrapper around get_processor (functools.lru_cache with its
# default settings): repeated calls with the same arguments reuse the
# previously returned processor instead of loading it again.
cached_get_processor = lru_cache(get_processor)
class
Qwen2AudioMultiModalProcessor
(
BaseMultiModalProcessor
):
def
_get_feat_extract_output_lengths
(
input_lengths
:
torch
.
LongTensor
):
def
_get_hf_processor
(
self
)
->
Qwen2AudioProcessor
:
"""
return
self
.
ctx
.
get_hf_processor
(
Qwen2AudioProcessor
)
Computes the output length of the convolutional layers
and the output length of the audio encoder
"""
input_lengths
=
(
input_lengths
-
1
)
//
2
+
1
output_lengths
=
(
input_lengths
-
2
)
//
2
+
1
return
input_lengths
,
output_lengths
def
_get_feature_extractor
(
self
)
->
WhisperFeatureExtractor
:
return
self
.
_get_hf_processor
().
feature_extractor
# type: ignore
def
get_max_qwen2_audio_audio_tokens
(
ctx
:
InputContext
)
->
int
:
def
_get_processor_data
(
max_source_position
=
(
self
,
ctx
.
model_config
.
hf_config
.
audio_config
.
max_source_positions
)
mm_items
:
MultiModalDataItems
,
output_lengths
=
(
max_source_position
-
2
)
//
2
+
1
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
return
output_lengths
# resample audio to the model's sampling rate
feature_extractor
=
self
.
_get_feature_extractor
()
mm_items
.
resample_audios
(
feature_extractor
.
sampling_rate
)
return
super
().
_get_processor_data
(
mm_items
)
def
input_processor_for_qwen2_audio
(
def
_call_hf_processor
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
)
->
DecoderOnlyInputs
:
self
,
multi_modal_data
=
inputs
.
get
(
"multi_modal_data"
)
hf_processor
:
ProcessorMixin
,
if
multi_modal_data
is
None
or
"audio"
not
in
multi_modal_data
:
prompt
:
str
,
return
inputs
processor_data
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
audios
=
multi_modal_data
[
"audio"
]
)
->
BatchFeature
:
if
not
isinstance
(
audios
,
list
):
processor_data
=
dict
(
processor_data
)
audios
=
[
audios
]
audios
=
processor_data
.
pop
(
"audios"
,
[])
if
len
(
audios
)
==
0
:
if
audios
:
return
inputs
processor_data
[
"audios"
]
=
audios
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
feature_extractor
=
self
.
_get_feature_extractor
()
resampled_audios
=
[
mm_processor_kwargs
=
dict
(
librosa
.
resample
(
audio
,
**
mm_processor_kwargs
,
orig_sr
=
sampling_rate
,
sampling_rate
=
feature_extractor
.
sampling_rate
,
target_sr
=
processor
.
feature_extractor
.
sampling_rate
)
)
for
audio
,
sampling_rate
in
audios
else
:
]
# NOTE: WhisperFeatureExtractor cannot handle empty list of audios
audio_input_lengths
=
np
.
array
(
pass
[
min
(
3000
,
_
.
shape
[
0
]
//
160
+
1
)
for
_
in
resampled_audios
])
return
super
().
_call_hf_processor
(
audio_feat_lengths
,
audio_output_lengths
=
_get_feat_extract_output_lengths
(
hf_processor
,
audio_input_lengths
)
prompt
=
prompt
,
processor_data
=
processor_data
,
audio_token_index
=
ctx
.
model_config
.
hf_config
.
audio_token_index
mm_processor_kwargs
=
mm_processor_kwargs
,
)
input_ids
=
inputs
[
'prompt_token_ids'
]
def
_get_prompt_replacements
(
new_input_ids
=
[]
self
,
audio_num
=
input_ids
.
count
(
audio_token_index
)
mm_items
:
MultiModalDataItems
,
assert
len
(
audio_input_lengths
)
==
audio_num
,
\
hf_inputs
:
BatchFeature
,
(
f
'The text input contains
{
audio_num
}
audio tokens, '
mm_processor_kwargs
:
Mapping
[
str
,
object
],
f
'but
{
len
(
audio_input_lengths
)
}
audios provided'
)
)
->
list
[
PromptReplacement
]:
start
=
0
hf_config
=
self
.
ctx
.
get_hf_config
(
Qwen2AudioConfig
)
for
audio_idx
in
range
(
audio_num
):
placeholder
=
hf_config
.
audio_token_index
end
=
input_ids
.
index
(
audio_token_index
,
start
)
new_input_ids
.
extend
(
input_ids
[
start
:
end
])
# text part
feature_attention_mask
=
hf_inputs
.
get
(
"feature_attention_mask"
)
if
feature_attention_mask
is
None
:
new_input_ids
.
extend
([
audio_token_index
]
*
audio_output_lengths
=
[]
audio_output_lengths
[
audio_idx
])
else
:
start
=
end
+
1
_
,
audio_output_lengths
=
_get_feat_extract_output_lengths
(
new_input_ids
.
extend
(
input_ids
[
start
:])
feature_attention_mask
.
sum
(
-
1
))
return
token_inputs
(
def
get_replacement_qwen2_audio
(
item_idx
:
int
):
prompt_token_ids
=
new_input_ids
,
return
[
placeholder
]
*
audio_output_lengths
[
item_idx
]
prompt
=
inputs
.
get
(
"prompt"
),
multi_modal_data
=
multi_modal_data
,
return
[
)
PromptReplacement
(
modality
=
"audio"
,
target
=
[
placeholder
],
def
input_mapper_for_qwen2_audio
(
replacement
=
get_replacement_qwen2_audio
,
ctx
:
InputContext
,
)
multi_modal_data
:
Union
[
np
.
ndarray
,
List
[
np
.
ndarray
]],
)
->
MultiModalKwargs
:
"""Input mapper for Qwen2-Audio."""
if
not
isinstance
(
multi_modal_data
,
list
):
multi_modal_data
=
[
multi_modal_data
]
if
len
(
multi_modal_data
)
==
0
:
return
MultiModalKwargs
()
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
audio_feature_extractor
=
processor
.
feature_extractor
if
audio_feature_extractor
is
None
:
raise
RuntimeError
(
"No HuggingFace audio_feature_extractor is available "
"to process the audio object"
)
try
:
resampled_audios
=
[
librosa
.
resample
(
audio
,
orig_sr
=
sampling_rate
,
target_sr
=
processor
.
feature_extractor
.
sampling_rate
)
for
audio
,
sampling_rate
in
multi_modal_data
]
]
batch_data
=
audio_feature_extractor
(
resampled_audios
,
sampling_rate
=
16000
,
def
_get_dummy_mm_inputs
(
return_attention_mask
=
True
,
self
,
padding
=
"max_length"
,
mm_counts
:
Mapping
[
str
,
int
],
return_tensors
=
"pt"
).
data
)
->
ProcessorInputs
:
batch_data
[
"feature_attention_mask"
]
=
batch_data
.
pop
(
"attention_mask"
)
feature_extractor
=
self
.
_get_feature_extractor
()
except
Exception
:
sampling_rate
=
feature_extractor
.
sampling_rate
logger
.
error
(
"Failed to process audio (%s)"
,
multi_modal_data
)
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
raise
audio_count
=
mm_counts
[
"audio"
]
return
MultiModalKwargs
(
batch_data
)
audio
=
np
.
zeros
(
audio_len
)
data
=
{
"audio"
:
[
audio
]
*
audio_count
}
@
INPUT_REGISTRY
.
register_dummy_data
(
dummy_data_for_qwen2_audio
)
return
ProcessorInputs
(
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_qwen2_audio
)
prompt_text
=
"<|AUDIO|>"
*
audio_count
,
@
MULTIMODAL_REGISTRY
.
register_input_mapper
(
"audio"
,
mm_data
=
data
,
input_mapper_for_qwen2_audio
)
mm_processor_kwargs
=
{},
)
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
"audio"
,
get_max_qwen2_audio_audio_tokens
)
"audio"
,
get_max_qwen2_audio_audio_tokens
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Qwen2AudioMultiModalProcessor
)
class
Qwen2AudioForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
class
Qwen2AudioForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
SupportsPP
):
...
@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return
get_sampler
()
return
get_sampler
()
def
_validate_and_reshape_mm_tensor
(
self
,
def
_validate_and_reshape_mm_tensor
(
self
,
mm_input
:
object
,
mm_input
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]],
name
:
str
)
->
torch
.
Tensor
:
name
:
str
)
->
torch
.
Tensor
:
if
not
isinstance
(
mm_input
,
(
torch
.
Tensor
,
list
)):
if
not
isinstance
(
mm_input
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
f
"Incorrect type of
{
name
}
. "
raise
ValueError
(
f
"Incorrect type of
{
name
}
. "
...
...
vllm/model_executor/models/qwen2_cls.py
deleted
100644 → 0
View file @
f9f4a735
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 Kakao Corp. (Kanana-X Team)
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
"""Inference-only Qwen2-Classification model compatible with HF weights."""
from
typing
import
Iterable
,
List
,
Optional
,
Set
,
Tuple
import
torch
from
torch
import
nn
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.linear
import
RowParallelLinear
from
vllm.model_executor.layers.pooler
import
Pooler
,
PoolingType
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
AutoWeightsLoader
,
maybe_prefix
class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP):
    """Qwen2 backbone with a linear ``score`` head and a pooler.

    ``forward`` returns the output of the score head applied to the
    backbone's hidden states; ``pooler`` applies the configured pooler
    (defaults: last-token pooling, no normalization, softmax enabled).
    """

    # Maps each packed module name to the sub-module names it is built from.
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    # LoRA specific attributes
    supported_lora_modules = [
        "qkv_proj",
        "o_proj",
        "gate_up_proj",
        "down_proj",
    ]
    embedding_modules = {}
    embedding_padding_modules = []

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the backbone, score head and pooler from ``vllm_config``.

        Args:
            vllm_config: source of the HF config, quantization config,
                LoRA config and pooler config.
            prefix: name prefix combined with ``"model"`` / ``"score"``
                via ``maybe_prefix`` for the sub-modules.
        """
        super().__init__()

        model_config = vllm_config.model_config
        hf_config = model_config.hf_config
        quantization = vllm_config.quant_config

        self.config = hf_config
        self.lora_config = vllm_config.lora_config
        self.quant_config = quantization

        self.model = Qwen2Model(
            vllm_config=vllm_config,
            prefix=maybe_prefix(prefix, "model"),
        )

        # The hidden states coming out of Qwen2Model have already been
        # reduced, so the score layer's input is not parallelized.
        self.score = RowParallelLinear(
            hf_config.hidden_size,
            hf_config.num_labels,
            quant_config=quantization,
            input_is_parallel=False,
            bias=False,
            prefix=maybe_prefix(prefix, "score"),
        )

        self._pooler = Pooler.from_config_with_defaults(
            model_config.pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=True,
        )

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate embedding lookup to the wrapped backbone."""
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the backbone, then return the score head's first output."""
        backbone_out = self.model(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors,
            inputs_embeds,
        )
        # The score layer returns a pair; only the first element is used.
        scores, _unused = self.score(backbone_out)
        return scores

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """Apply the pooler built in ``__init__`` to ``hidden_states``."""
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        """Load ``weights`` via ``AutoWeightsLoader``.

        Unexpected weights whose names start with ``"lm_head."`` are
        ignored. Returns whatever the loader's ``load_weights`` returns.
        """
        return AutoWeightsLoader(
            self,
            ignore_unexpected_prefixes=["lm_head."],
        ).load_weights(weights)
vllm/model_executor/models/qwen2_vl.py
View file @
96ae75ad
...
@@ -22,28 +22,26 @@
...
@@ -22,28 +22,26 @@
# limitations under the License.
# limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
from
functools
import
cached_property
,
partial
from
functools
import
cached_property
,
partial
from
typing
import
(
Any
,
Callable
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
from
typing
import
(
Any
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Optional
,
Set
,
Tuple
,
Type
,
TypedDict
,
Union
)
Tuple
,
Type
,
TypedDict
,
Union
)
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
einops
import
rearrange
,
repeat
from
PIL
import
Image
from
PIL
import
Image
from
transformers
.image_utils
import
(
get_image_size
,
from
transformers
import
BatchFeature
infer_channel_dimension_format
,
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
to_numpy_array
)
Qwen2VLProcessor
)
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
Qwen2VLConfig
,
Qwen2VLVisionConfig
)
Qwen2VLConfig
,
Qwen2VLVisionConfig
)
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
(
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
make_batched_images
,
make_batched_videos
,
smart_resize
)
from
vllm.attention
import
AttentionMetadata
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
from
vllm.inputs
import
InputContext
InputContext
,
token_inputs
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.activation
import
QuickGELU
...
@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
...
@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.i
mage
import
cached_get_image_proces
sor
from
vllm.multimodal.i
nputs
import
MultiModalDataDict
,
NestedTen
sor
s
from
vllm.multimodal.
inputs
import
(
MultiModal
Data
,
MultiModalDataDict
,
from
vllm.multimodal.
processing
import
(
Base
MultiModal
Processor
,
MultiModal
Kwargs
,
NestedTensors
)
MultiModal
DataItems
,
ProcessorInputs
,
from
vllm.multimodal.utils
import
cached_get_tokenizer
PromptReplacement
)
from
vllm.platforms
import
_Backend
from
vllm.platforms
import
_Backend
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.
transformers_utils.processor
import
cached_get_processor
from
vllm.
utils
import
is_list_of
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
get_vit_attn_backend
,
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
get_vit_attn_backend
,
...
@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module):
...
@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module):
def
__init__
(
def
__init__
(
self
,
self
,
in_features
:
int
,
in_features
:
int
,
hidden_features
:
int
=
None
,
hidden_features
:
int
,
act_layer
:
Type
[
nn
.
Module
]
=
QuickGELU
,
act_layer
:
Type
[
nn
.
Module
]
=
QuickGELU
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
prefix
:
str
=
""
,
...
@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module):
...
@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module):
# === Vision input helpers === #
# === Vision input helpers === #
def get_mm_processor_kwargs(
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None) -> Dict[str, int]:
    """Collect the pixel limits that were actually supplied.

    Returns a dict containing ``"min_pixels"`` and/or ``"max_pixels"``
    (in that order) for every argument that is truthy; falsy values
    (``None`` or ``0``) are left out.
    """
    candidates = {"min_pixels": min_pixels, "max_pixels": max_pixels}
    return {name: value for name, value in candidates.items() if value}
def mm_input_mapper_for_qwen2_vl(
    ctx: InputContext,
    data: MultiModalData[object],
    data_type_key: str,
    *,
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
) -> MultiModalKwargs:
    """Map raw image / video data to Qwen2-VL model keyword arguments.

    Args:
        ctx: input context; its ``model_config`` supplies the model name
            and ``trust_remote_code`` used to fetch the image processor.
        data: the multimodal item(s). A dict is passed through by key
            (``*_embeds`` / ``*_grid_thw``) without running the processor.
        data_type_key: ``"image"`` or ``"video"``.
        min_pixels: optional override forwarded to the image processor.
        max_pixels: optional override forwarded to the image processor.

    Raises:
        RuntimeError: if no image processor could be obtained.
        AssertionError: if ``data_type_key`` is neither ``"image"`` nor
            ``"video"`` (checked after the processor has been obtained).
    """
    # Dict inputs bypass preprocessing; only the two expected keys are kept.
    passthrough_keys = {
        "image": ("image_embeds", "image_grid_thw"),
        "video": ("video_embeds", "video_grid_thw"),
    }
    if isinstance(data, dict) and data_type_key in passthrough_keys:
        return MultiModalKwargs(
            {key: data.get(key)
             for key in passthrough_keys[data_type_key]})

    cfg = ctx.model_config
    # The pixel limits are handed over at creation time because
    # preprocess() in transformers doesn't expose them.
    image_processor = cached_get_image_processor(
        cfg.model,
        trust_remote_code=cfg.trust_remote_code,
        **get_mm_processor_kwargs(min_pixels=min_pixels,
                                  max_pixels=max_pixels),
    )
    if image_processor is None:
        raise RuntimeError("No HuggingFace processor is available "
                           "to process the image object")

    # Route the payload to exactly one of the two preprocess() inputs.
    if data_type_key == "image":
        images, videos = data, None
    else:
        assert data_type_key == "video"
        images, videos = None, data

    try:
        processed = image_processor.preprocess(images=images,
                                               videos=videos,
                                               return_tensors="pt")
        batch_data = processed.data
    except Exception:
        # Log the offending input, then let the original error propagate.
        logger.error("Failed to process image (%s)", data)
        raise

    return MultiModalKwargs(batch_data)
# Modality-specific entry points: the shared mapper with data_type_key
# pre-bound to "image" and "video" respectively.
image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
                                          data_type_key="image")
video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
                                          data_type_key="video")
def
_get_vision_info
(
def
_get_vision_info
(
image_processor
,
vision_config
:
Qwen2VLVisionConfig
,
height
:
int
,
height
:
int
,
width
:
int
,
width
:
int
,
min_pixels
:
int
,
min_pixels
:
int
,
...
@@ -775,12 +703,15 @@ def _get_vision_info(
...
@@ -775,12 +703,15 @@ def _get_vision_info(
):
):
"""Get information (resized height / width and number of vision tokens)
"""Get information (resized height / width and number of vision tokens)
of input image / video frame."""
of input image / video frame."""
patch_size
=
vision_config
.
patch_size
merge_size
=
vision_config
.
spatial_merge_size
temporal_patch_size
=
vision_config
.
temporal_patch_size
if
do_resize
:
if
do_resize
:
resized_height
,
resized_width
=
smart_resize
(
resized_height
,
resized_width
=
smart_resize
(
height
=
height
,
height
=
height
,
width
=
width
,
width
=
width
,
factor
=
image_processor
.
patch_size
*
image_processor
.
merge_size
,
factor
=
patch_size
*
merge_size
,
min_pixels
=
min_pixels
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
max_pixels
=
max_pixels
,
)
)
...
@@ -791,54 +722,41 @@ def _get_vision_info(
...
@@ -791,54 +722,41 @@ def _get_vision_info(
grid_t
=
mm_count
grid_t
=
mm_count
else
:
else
:
assert
data_type_key
==
"video"
assert
data_type_key
==
"video"
grid_t
=
max
(
mm_count
//
image_processor
.
temporal_patch_size
,
1
)
grid_t
=
max
(
mm_count
//
temporal_patch_size
,
1
)
grid_h
=
resized_height
//
image_processor
.
patch_size
grid_h
=
resized_height
//
patch_size
grid_w
=
resized_width
//
image_processor
.
patch_size
grid_w
=
resized_width
//
patch_size
vision_tokens
=
grid_t
*
grid_h
*
grid_w
vision_tokens
=
grid_t
*
grid_h
*
grid_w
llm_num_vision_tokens
=
(
vision_tokens
//
image_processor
.
merge_size
//
llm_num_vision_tokens
=
vision_tokens
//
(
merge_size
**
2
)
image_processor
.
merge_size
)
return
resized_height
,
resized_width
,
llm_num_vision_tokens
return
resized_height
,
resized_width
,
llm_num_vision_tokens
def
_get_max_image_info
(
def
_get_image_processor
(
hf_processor
:
Qwen2VLProcessor
):
image_processor
,
image_processor
=
hf_processor
.
image_processor
# type: ignore
data_type_key
:
str
=
"image"
,
assert
isinstance
(
image_processor
,
Qwen2VLImageProcessor
)
mm_count
:
int
=
1
,
return
image_processor
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
):
# Limit min / max pixels unless they're explicitly provided
if
min_pixels
is
None
:
min_pixels
=
max
(
image_processor
.
min_pixels
,
28
*
28
)
if
max_pixels
is
None
:
max_pixels
=
min
(
image_processor
.
max_pixels
,
1280
*
28
*
28
)
return
_get_vision_info
(
image_processor
,
height
=
9999999
,
width
=
9999999
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
data_type_key
=
data_type_key
,
mm_count
=
mm_count
,
)
def
get_max_qwen2_vl_mm_tokens
(
ctx
:
InputContext
,
def
get_max_qwen2_vl_mm_tokens
(
ctx
:
InputContext
,
data_type_key
:
str
,
data_type_key
:
str
,
*
,
*
,
min_pixels
=
None
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
=
None
)
->
int
:
max_pixels
:
Optional
[
int
]
=
None
)
->
int
:
mm_processor_kwargs
=
get_mm_processor_kwargs
(
min_pixels
=
min_pixels
,
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
max_pixels
=
max_pixels
)
vision_config
=
hf_config
.
vision_config
image_processor
=
cached_get_image_processor
(
ctx
.
model_config
.
model
,
**
mm_processor_kwargs
)
hf_processor
=
ctx
.
get_hf_processor
(
Qwen2VLProcessor
)
max_resized_height
,
max_resized_width
,
max_llm_image_tokens
=
\
image_processor
=
_get_image_processor
(
hf_processor
)
_get_max_image_info
(
image_processor
,
data_type_key
=
data_type_key
,
mm_count
=
1
,
min_pixels
=
min_pixels
,
_
,
_
,
max_llm_image_tokens
=
_get_vision_info
(
max_pixels
=
max_pixels
)
vision_config
,
height
=
9999999
,
width
=
9999999
,
min_pixels
=
min_pixels
or
image_processor
.
min_pixels
,
max_pixels
=
max_pixels
or
image_processor
.
max_pixels
,
data_type_key
=
data_type_key
,
)
return
max_llm_image_tokens
return
max_llm_image_tokens
...
@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
...
@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
data_type_key
=
"video"
)
data_type_key
=
"video"
)
def
dummy_data_for_qwen2_vl
(
class
Qwen2VLMultiModalDataItems
(
MultiModalDataItems
):
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
)
->
Tuple
[
SequenceData
,
Optional
[
MultiModalDataDict
]]:
mm_processor_kwargs
=
get_mm_processor_kwargs
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
image_processor
=
cached_get_image_processor
(
ctx
.
model_config
.
model
,
**
mm_processor_kwargs
)
num_images
=
mm_counts
[
"image"
]
max_resized_height
,
max_resized_width
,
max_llm_image_tokens
=
\
_get_max_image_info
(
image_processor
,
data_type_key
=
"image"
,
mm_count
=
num_images
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
if
seq_len
-
max_llm_image_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-VL cannot process
{
num_images
}
images in a prompt, "
"please increase max_model_len or reduce image limit by "
"--limit-mm-per-prompt."
)
# Check video counts.
num_videos
=
mm_counts
[
"video"
]
max_resized_height
,
max_resized_width
,
max_llm_video_tokens
=
\
_get_max_image_info
(
image_processor
,
data_type_key
=
"video"
,
mm_count
=
num_videos
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
if
seq_len
-
max_llm_video_tokens
-
2
<
0
:
raise
RuntimeError
(
f
"Qwen2-VL cannot process
{
num_videos
}
videos in a prompt, "
"please increase max_model_len or reduce video limit by "
"--limit-mm-per-prompt."
)
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
dummy_seqdata
=
SequenceData
.
from_prompt_token_counts
(
(
hf_config
.
vision_start_token_id
,
1
),
(
hf_config
.
image_token_id
,
max_llm_image_tokens
),
(
hf_config
.
vision_end_token_id
,
1
),
(
0
,
seq_len
-
max_llm_image_tokens
-
2
),
)
dummy_image
=
Image
.
new
(
"RGB"
,
(
max_resized_width
,
max_resized_height
),
color
=
0
)
return
DummyData
(
dummy_seqdata
,
{
"image"
:
dummy_image
if
num_images
==
1
else
[
dummy_image
]
*
num_images
})
@
staticmethod
def
from_dict
(
data
:
MultiModalDataDict
)
->
"MultiModalDataItems"
:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data
=
Qwen2VLMultiModalDataItems
()
for
k
,
v
in
data
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
(
isinstance
(
v
,
(
dict
,
torch
.
Tensor
))
# type: ignore[assignment]
or
is_list_of
(
v
,
list
))
else
[
v
]
)
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
isinstance
(
v
,
(
dict
,
torch
.
Tensor
,
list
))
else
[
v
]
)
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
def
_get_llm_num_vision_tokens
(
return
multi_data
mm_inputs
:
list
,
data_type_key
:
str
,
image_processor
,
min_pixels
:
int
,
max_pixels
:
int
,
):
"""Get number of vision tokens of multimodal inputs.
This method is derived from `transformers.models.qwen2_vl.
def
get_item_counts
(
self
)
->
Mapping
[
str
,
int
]:
image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
return
{
"""
m
:
(
image
=
to_numpy_array
(
mm_inputs
[
0
])
len
(
items
[
f
"
{
m
}
_grid_thw"
])
# type: ignore
input_data_format
=
infer_channel_dimension_format
(
image
)
if
isinstance
(
items
,
dict
)
else
len
(
items
))
height
,
width
=
get_image_size
(
image
,
channel_dim
=
input_data_format
)
for
m
,
items
in
self
.
items
()
}
_
,
_
,
llm_num_vision_tokens
=
_get_vision_info
(
image_processor
,
height
=
height
,
width
=
width
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
do_resize
=
image_processor
.
do_resize
,
data_type_key
=
data_type_key
,
mm_count
=
len
(
mm_inputs
),
)
return
llm_num_vision_tokens
def
_expand_pad_tokens
(
inputs
:
list
,
token_id
:
int
,
make_batched_fn
:
Callable
,
class
Qwen2VLMultiModalProcessor
(
BaseMultiModalProcessor
):
data_type_key
:
str
,
image_processor
:
Any
,
prompt_token_ids
:
List
[
int
],
min_pixels
:
Optional
[
int
],
max_pixels
:
Optional
[
int
])
->
List
[
int
]:
"""
Expand pad tokens for multi-modal inputs (e.g., images or videos).
Args:
inputs (list): The multi-modal inputs (e.g., images or videos).
token_id (int): The token ID used to represent the multi-modal input.
make_batched_fn (Callable): A function to batch the inputs.
data_type_key (str): The type of the multi-modal input.
image_processor (Any): The image processor used to process the inputs.
prompt_token_ids (List[int]): The list of token IDs in the prompt.
min_pixels (int): min pixels to used for img processing
max_pixels (int): max pixels to be used for img processing
Returns:
List[int]: The list of token IDs for the multi-modal inputs.
"""
indices
=
[
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
if
token
==
token_id
]
inputs
=
make_batched_fn
(
inputs
)
assert
len
(
indices
)
==
len
(
inputs
)
prompt_token_ids_with_data
=
[]
for
cnt
,
data
in
enumerate
(
inputs
):
num_tokens
=
_get_llm_num_vision_tokens
(
[
data
]
if
data_type_key
==
"image"
else
data
,
data_type_key
=
data_type_key
,
image_processor
=
image_processor
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
)
if
cnt
==
0
:
end_idx
=
indices
[
cnt
]
non_data_tokens
=
prompt_token_ids
[:
end_idx
]
else
:
non_data_tokens
=
prompt_token_ids
[
indices
[
cnt
-
1
]
+
1
:
indices
[
cnt
]]
prompt_token_ids_with_data
.
extend
(
non_data_tokens
)
prompt_token_ids_with_data
.
extend
(
token_id
for
_
in
range
(
num_tokens
))
prompt_token_ids_with_data
.
extend
(
prompt_token_ids
[
indices
[
-
1
]
+
1
:])
return
prompt_token_ids_with_data
def
input_processor_for_qwen2_vl
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
)
->
DecoderOnlyInputs
:
multi_modal_data
=
inputs
.
get
(
"multi_modal_data"
)
if
multi_modal_data
is
None
:
return
inputs
image_inputs
=
multi_modal_data
.
get
(
"image"
,
None
)
video_inputs
=
multi_modal_data
.
get
(
"video"
,
None
)
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
image_processor
=
processor
.
image_processor
# Apply processor kwarg overrides for image processor options
min_pixels
=
min_pixels
if
min_pixels
else
image_processor
.
min_pixels
max_pixels
=
max_pixels
if
max_pixels
else
image_processor
.
max_pixels
model_config
=
ctx
.
model_config
hf_config
=
ctx
.
get_hf_config
(
Qwen2VLConfig
)
# To avoid redundant processing of vision objects (resize, rescale, etc.),
def
_get_mm_items
(
# we extract code of calculating number of vision tokens from
self
,
# `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
mm_data
:
MultiModalDataDict
,
#
)
->
MultiModalDataItems
:
# The following code is equivalent to:
return
Qwen2VLMultiModalDataItems
.
from_dict
(
mm_data
)
# prompt = inputs["prompt"]
# inputs = processor(text=[prompt],
# images=image_inputs,
# videos=video_inputs,
# padding=True,
# return_tensors="pt")
# prompt_token_ids = inputs["input_ids"][0].tolist()
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
)
prompt_token_ids
=
inputs
[
"prompt_token_ids"
]
# Expand image pad tokens.
if
image_inputs
is
not
None
:
if
isinstance
(
image_inputs
,
dict
):
prompt_token_ids_with_image
=
[]
image_indices
=
[
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
if
token
==
hf_config
.
image_token_id
]
# ensure all image tokens have grid_thw
def
_get_hf_processor
(
assert
\
self
,
len
(
image_indices
)
==
image_inputs
[
"image_grid_thw"
].
size
(
0
),
\
*
,
"image token num does not match image_grid_thw.shape"
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
image_counter
=
0
)
->
Qwen2VLProcessor
:
pad_token_counter
=
0
hf_processor
=
self
.
ctx
.
get_hf_processor
(
Qwen2VLProcessor
)
for
idx
,
token
in
enumerate
(
prompt_token_ids
):
image_processor
=
_get_image_processor
(
hf_processor
)
if
idx
in
image_indices
:
grid_thw
=
image_inputs
[
"image_grid_thw"
][
image_counter
]
if
min_pixels
:
grid_t
,
grid_h
,
grid_w
=
grid_thw
image_processor
.
min_pixels
=
min_pixels
num_pad_tokens
=
(
grid_t
*
grid_h
*
grid_w
//
if
max_pixels
:
image_processor
.
merge_size
//
image_processor
.
max_pixels
=
max_pixels
image_processor
.
merge_size
)
if
max_pixels
or
min_pixels
:
prompt_token_ids_with_image
.
extend
([
token
]
*
image_processor
.
size
=
{
num_pad_tokens
)
"min_pixels"
:
image_processor
.
min_pixels
,
image_counter
+=
1
"max_pixels"
:
image_processor
.
max_pixels
,
pad_token_counter
+=
num_pad_tokens
}
return
hf_processor
def
_get_processor_data
(
self
,
mm_items
:
MultiModalDataItems
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
processor_data
=
dict
[
str
,
Any
]()
passthrough_data
=
dict
[
str
,
Any
]()
for
k
,
v
in
mm_items
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
if
k
in
(
"image"
,
"video"
,
"audio"
):
if
isinstance
(
v
,
dict
):
# Pass through embedding inputs (dict)
passthrough_data
.
update
(
v
)
elif
isinstance
(
v
,
torch
.
Tensor
)
and
v
.
ndim
==
3
:
# Pass through embedding inputs (single)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
[
v
]
elif
(
is_list_of
(
v
,
torch
.
Tensor
)
and
len
(
v
)
>
0
and
v
[
0
].
ndim
==
2
):
# Pass through embedding inputs (multi)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
v
else
:
else
:
prompt_token_ids_with_image
.
append
(
token
)
# Map keys to plural form, e.g.: image -> images
processor_data
[
f
"
{
k
}
s"
]
=
v
else
:
processor_data
[
k
]
=
v
# ensure all embeddings are used
return
processor_data
,
passthrough_data
assert
\
pad_token_counter
==
image_inputs
[
"image_embeds"
].
size
(
0
),
\
"image_embeds.shape does not match image_grid_thw"
prompt_token_ids
=
prompt_token_ids_with_image
def
_get_prompt_replacements
(
else
:
self
,
prompt_token_ids
=
_expand_pad_tokens
(
image_inputs
,
mm_items
:
MultiModalDataItems
,
hf_config
.
image_token_id
,
hf_inputs
:
BatchFeature
,
make_batched_images
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
"image"
,
)
->
list
[
PromptReplacement
]:
image_processor
,
hf_processor
=
self
.
_get_hf_processor
()
prompt_token_ids
,
image_processor
=
_get_image_processor
(
hf_processor
)
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
if
video_inputs
is
not
None
:
placeholder
=
{
if
isinstance
(
video_inputs
,
dict
):
"image"
:
hf_processor
.
image_token
,
prompt_token_ids_with_video
=
[]
"video"
:
hf_processor
.
video_token
,
video_indices
=
[
}
idx
for
idx
,
token
in
enumerate
(
prompt_token_ids
)
merge_length
=
image_processor
.
merge_size
**
2
if
token
==
hf_config
.
video_token_id
]
def
get_replacement_qwen2vl
(
item_idx
:
int
,
modality
:
str
):
grid_thw
=
hf_inputs
[
f
"
{
modality
}
_grid_thw"
][
item_idx
]
num_tokens
=
grid_thw
.
prod
()
//
merge_length
return
placeholder
[
modality
]
*
num_tokens
return
[
PromptReplacement
(
modality
=
modality
,
target
=
placeholder
[
modality
],
replacement
=
partial
(
get_replacement_qwen2vl
,
modality
=
modality
),
)
for
modality
in
(
"image"
,
"video"
)
]
# ensure all video tokens have grid_thw
def
_get_dummy_mm_inputs
(
assert
\
self
,
len
(
video_indices
)
==
video_inputs
[
"video_grid_thw"
].
size
(
0
),
\
mm_counts
:
Mapping
[
str
,
int
],
"video token num does not match video_grid_thw.shape"
)
->
ProcessorInputs
:
num_images
=
mm_counts
[
"image"
]
video_counter
=
0
hf_processor
=
self
.
_get_hf_processor
()
pad_token_counter
=
0
image_token
:
str
=
hf_processor
.
image_token
for
idx
,
token
in
enumerate
(
prompt_token_ids
):
image_processor
=
_get_image_processor
(
hf_processor
)
if
idx
in
video_indices
:
grid_thw
=
video_inputs
[
"video_grid_thw"
][
video_counter
]
data
=
{}
grid_t
,
grid_h
,
grid_w
=
grid_thw
resized_height
,
resized_width
=
smart_resize
(
num_pad_tokens
=
(
grid_t
*
grid_h
*
grid_w
//
height
=
9999999
,
image_processor
.
merge_size
//
width
=
9999999
,
image_processor
.
merge_size
)
factor
=
image_processor
.
patch_size
*
image_processor
.
merge_size
,
prompt_token_ids_with_video
.
extend
([
token
]
*
min_pixels
=
image_processor
.
min_pixels
,
num_pad_tokens
)
max_pixels
=
image_processor
.
max_pixels
,
video_counter
+=
1
)
pad_token_counter
+=
num_pad_tokens
else
:
prompt_token_ids_with_video
.
append
(
token
)
# ensure all embeddings are used
dummy_image
=
Image
.
new
(
"RGB"
,
(
resized_width
,
resized_height
),
assert
\
color
=
0
)
pad_token_counter
==
video_inputs
[
"video_embeds"
].
size
(
0
),
\
data
[
"image"
]
=
[
dummy_image
]
*
num_images
"video_embeds.shape does not match video_grid_thw"
prompt_token_ids
=
prompt_token_ids_with_video
return
ProcessorInputs
(
else
:
prompt_text
=
image_token
*
num_images
,
prompt_token_ids
=
_expand_pad_tokens
(
video_inputs
,
mm_data
=
data
,
hf_config
.
video_token_id
,
mm_processor_kwargs
=
{},
make_batched_videos
,
)
"video"
,
image_processor
,
prompt_token_ids
,
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
prompt
=
inputs
.
get
(
"prompt"
)
if
prompt
is
None
:
prompt
=
tokenizer
.
decode
(
prompt_token_ids
)
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
prompt
=
prompt
,
multi_modal_data
=
multi_modal_data
,
)
@
MULTIMODAL_REGISTRY
.
register_image_input_mapper
(
image_input_mapper_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_input_mapper
(
"video"
,
video_input_mapper_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_max_image_tokens
(
get_max_qwen2_vl_image_tokens
)
@
MULTIMODAL_REGISTRY
.
register_max_image_tokens
(
get_max_qwen2_vl_image_tokens
)
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
"video"
,
get_max_qwen2_vl_video_tokens
)
"video"
,
get_max_qwen2_vl_video_tokens
)
@
INPUT_REGISTRY
.
register_dummy_data
(
dummy_data_for_qwen2_vl
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Qwen2VLMultiModalProcessor
)
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_qwen2_vl
)
class
Qwen2VLForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
class
Qwen2VLForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsLoRA
,
SupportsPP
):
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
packed_modules_mapping
=
{
...
@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
]
]
embedding_modules
=
{}
embedding_modules
=
{}
embedding_padding_modules
=
[]
embedding_padding_modules
=
[]
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
:
Qwen2VLConfig
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
...
@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
loader
=
AutoWeightsLoader
(
self
)
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/models/registry.py
View file @
96ae75ad
...
@@ -20,11 +20,10 @@ import torch.nn as nn
...
@@ -20,11 +20,10 @@ import torch.nn as nn
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
.adapters
import
as_embedding_model
from
.interfaces
import
(
has_inner_state
,
is_attention_free
,
is_hybrid
,
from
.interfaces
import
(
has_inner_state
,
is_attention_free
,
is_hybrid
,
supports_cross_encoding
,
supports_multimodal
,
supports_cross_encoding
,
supports_multimodal
,
supports_pp
)
supports_pp
)
from
.interfaces_base
import
is_pooling_model
,
is_text_generation_model
from
.interfaces_base
import
is_text_generation_model
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = {
...
@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = {
"DeciLMForCausalLM"
:
(
"decilm"
,
"DeciLMForCausalLM"
),
"DeciLMForCausalLM"
:
(
"decilm"
,
"DeciLMForCausalLM"
),
"DeepseekForCausalLM"
:
(
"deepseek"
,
"DeepseekForCausalLM"
),
"DeepseekForCausalLM"
:
(
"deepseek"
,
"DeepseekForCausalLM"
),
"DeepseekV2ForCausalLM"
:
(
"deepseek_v2"
,
"DeepseekV2ForCausalLM"
),
"DeepseekV2ForCausalLM"
:
(
"deepseek_v2"
,
"DeepseekV2ForCausalLM"
),
"DeepseekV3ForCausalLM"
:
(
"deepseek_v3"
,
"DeepseekV3ForCausalLM"
),
"ExaoneForCausalLM"
:
(
"exaone"
,
"ExaoneForCausalLM"
),
"ExaoneForCausalLM"
:
(
"exaone"
,
"ExaoneForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"GemmaForCausalLM"
:
(
"gemma"
,
"GemmaForCausalLM"
),
"GemmaForCausalLM"
:
(
"gemma"
,
"GemmaForCausalLM"
),
...
@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
...
@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
"Gemma2Model"
:
(
"gemma2"
,
"Gemma2ForCausalLM"
),
"Gemma2Model"
:
(
"gemma2"
,
"Gemma2ForCausalLM"
),
"GlmForCausalLM"
:
(
"glm"
,
"GlmForCausalLM"
),
"GlmForCausalLM"
:
(
"glm"
,
"GlmForCausalLM"
),
"GritLM"
:
(
"gritlm"
,
"GritLM"
),
"GritLM"
:
(
"gritlm"
,
"GritLM"
),
"JambaForSequenceClassification"
:
(
"jamba"
,
"JambaForSequenceClassification"
),
# noqa: E501
"LlamaModel"
:
(
"llama"
,
"LlamaForCausalLM"
),
"LlamaModel"
:
(
"llama"
,
"LlamaForCausalLM"
),
**
{
**
{
# Multiple models share the same architecture, so we include them all
# Multiple models share the same architecture, so we include them all
...
@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = {
...
@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = {
"Qwen2Model"
:
(
"qwen2"
,
"Qwen2EmbeddingModel"
),
"Qwen2Model"
:
(
"qwen2"
,
"Qwen2EmbeddingModel"
),
"Qwen2ForCausalLM"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
"Qwen2ForCausalLM"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
"Qwen2ForRewardModel"
:
(
"qwen2_rm"
,
"Qwen2ForRewardModel"
),
"Qwen2ForRewardModel"
:
(
"qwen2_rm"
,
"Qwen2ForRewardModel"
),
"Qwen2ForSequenceClassification"
:
(
"qwen2_cls"
,
"Qwen2ForSequenceClassification"
),
# noqa: E501
"TeleChat2ForCausalLM"
:
(
"telechat2"
,
"TeleChat2ForCausalLM"
),
"TeleChat2ForCausalLM"
:
(
"telechat2"
,
"TeleChat2ForCausalLM"
),
# [Multimodal]
# [Multimodal]
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# noqa: E501
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
# [Auto-converted (see adapters.py)]
"Qwen2ForSequenceClassification"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
}
}
_CROSS_ENCODER_MODELS
=
{
_CROSS_ENCODER_MODELS
=
{
...
@@ -225,19 +227,10 @@ class _ModelInfo:
...
@@ -225,19 +227,10 @@ class _ModelInfo:
@
staticmethod
@
staticmethod
def
from_model_cls
(
model
:
Type
[
nn
.
Module
])
->
"_ModelInfo"
:
def
from_model_cls
(
model
:
Type
[
nn
.
Module
])
->
"_ModelInfo"
:
is_pooling_model_
=
is_pooling_model
(
model
)
if
not
is_pooling_model_
:
try
:
as_embedding_model
(
model
)
except
Exception
:
pass
else
:
is_pooling_model_
=
True
return
_ModelInfo
(
return
_ModelInfo
(
architecture
=
model
.
__name__
,
architecture
=
model
.
__name__
,
is_text_generation_model
=
is_text_generation_model
(
model
),
is_text_generation_model
=
is_text_generation_model
(
model
),
is_pooling_model
=
is_
pooling
_
model
_
,
is_pooling_model
=
True
,
# Can convert any model into a
pooling
model
supports_cross_encoding
=
supports_cross_encoding
(
model
),
supports_cross_encoding
=
supports_cross_encoding
(
model
),
supports_multimodal
=
supports_multimodal
(
model
),
supports_multimodal
=
supports_multimodal
(
model
),
supports_pp
=
supports_pp
(
model
),
supports_pp
=
supports_pp
(
model
),
...
...
vllm/model_executor/models/telechat2.py
View file @
96ae75ad
...
@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
...
@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
class
TeleChat2Model
(
LlamaModel
):
class
TeleChat2Model
(
LlamaModel
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"transformer."
:
"model."
,
},
orig_to_new_substr
=
{
".h."
:
".layers."
,
".self_attention."
:
".self_attn."
,
".word_embeddings."
:
".embed_tokens."
,
".dense."
:
".o_proj."
,
".ln_f."
:
".norm."
,
},
)
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
# 1. Initialize the LlamaModel with bias
# 1. Initialize the LlamaModel with bias
vllm_config
.
model_config
.
hf_config
.
bias
=
True
vllm_config
.
model_config
.
hf_config
.
bias
=
True
...
@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
...
@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"transformer."
:
"model."
,
},
orig_to_new_substr
=
{
".h."
:
".layers."
,
".self_attention."
:
".self_attn."
,
".word_embeddings."
:
".embed_tokens."
,
".dense."
:
".o_proj."
,
".ln_f."
:
".norm."
,
},
)
loader
=
AutoWeightsLoader
(
loader
=
AutoWeightsLoader
(
self
,
self
,
skip_prefixes
=
([
"lm_head."
]
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
if
self
.
config
.
tie_word_embeddings
else
None
),
)
)
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/models/ultravox.py
View file @
96ae75ad
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
math
import
math
from
functools
import
cached_property
,
lru_cache
from
functools
import
cached_property
,
lru_cache
from
typing
import
(
Any
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
from
typing
import
(
Any
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
Tuple
,
TypedDict
,
Union
)
import
numpy
as
np
import
numpy
as
np
...
@@ -11,7 +11,7 @@ import torch
...
@@ -11,7 +11,7 @@ import torch
import
torch.utils.checkpoint
import
torch.utils.checkpoint
from
torch
import
nn
from
torch
import
nn
from
torch.nn
import
functional
as
F
from
torch.nn
import
functional
as
F
from
transformers
import
BatchFeature
from
transformers
import
BatchFeature
,
ProcessorMixin
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
transformers.models.whisper.modeling_whisper
import
WhisperEncoder
from
transformers.models.whisper.modeling_whisper
import
WhisperEncoder
...
@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
...
@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
NestedTensors
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
NestedTensors
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
MultiModalDataDict
,
MultiModalDataItems
,
ProcessorInputs
,
MultiModalDataItems
,
ProcessorInputs
,
PromptReplacement
)
PromptReplacement
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
...
@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
...
@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
def
whisper_feature_extractor
(
ctx
:
InputContext
)
->
WhisperFeatureExtractor
:
def
whisper_feature_extractor
(
ctx
:
InputContext
)
->
WhisperFeatureExtractor
:
return
cached_feature_extractor
(
hf_config
=
ctx
.
get_hf_config
(
UltravoxConfig
)
ctx
.
get_hf_config
(
UltravoxC
onfig
)
.
audio_model_id
)
return
cached_feature_extractor
(
hf_c
onfig
.
audio_model_id
)
def
get_ultravox_max_audio_tokens
(
ctx
:
InputContext
):
def
get_ultravox_max_audio_tokens
(
ctx
:
InputContext
):
...
@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
...
@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
class
UltravoxMultiModalProcessor
(
BaseMultiModalProcessor
):
class
UltravoxMultiModalProcessor
(
BaseMultiModalProcessor
):
def
_get_feature_extractor
(
self
)
->
WhisperFeatureExtractor
:
def
_get_feature_extractor
(
self
)
->
WhisperFeatureExtractor
:
return
self
.
_get_hf_processor
().
audio_processor
.
feature_extractor
hf_processor
=
self
.
_get_hf_processor
()
return
hf_processor
.
audio_processor
.
feature_extractor
# type: ignore
def
_
resample_audio
(
def
_
get_processor_data
(
self
,
self
,
audio
:
np
.
ndarray
,
mm_items
:
MultiModalDataItems
,
sr
:
int
,
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]:
)
->
Dict
[
str
,
Union
[
np
.
ndarray
,
int
]]:
# resample audio to the model's sampling rate
# resample audio to the model's sampling rate
feature_extractor
=
self
.
_get_feature_extractor
()
feature_extractor
=
self
.
_get_feature_extractor
()
if
sr
!=
feature_extractor
.
sampling_rate
:
mm_items
.
resample_audios
(
feature_extractor
.
sampling_rate
)
try
:
import
librosa
return
super
().
_get_processor_data
(
mm_items
)
except
ImportError
as
exc
:
raise
ImportError
(
def
_call_hf_processor
(
"Please install vllm[audio] for audio support."
)
from
exc
audio
=
librosa
.
resample
(
audio
,
orig_sr
=
sr
,
target_sr
=
feature_extractor
.
sampling_rate
)
sr
=
feature_extractor
.
sampling_rate
return
{
"audio"
:
audio
,
"sampling_rate"
:
sr
}
def
_apply_hf_processor
(
self
,
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
prompt
:
str
,
mm_data
:
MultiModalDataDi
ct
,
processor_data
:
Mapping
[
str
,
obje
ct
]
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
)
->
BatchFeature
:
if
not
mm_data
or
not
mm_data
.
get
(
"audio"
,
None
):
processor_data
=
dict
(
processor_data
)
return
super
().
_apply_hf_processor
(
prompt
,
mm_data
,
audios
=
processor_data
.
pop
(
"audios"
,
[])
mm_processor_kwargs
)
if
not
audios
:
return
super
().
_call_hf_processor
(
hf_processor
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
feature_extractor
=
self
.
_get_feature_extractor
()
mm_processor_kwargs
=
dict
(
**
mm_processor_kwargs
,
sampling_rate
=
feature_extractor
.
sampling_rate
,
)
audio_data
=
mm_data
[
"audio"
]
# Already resampled by _get_processor_data
if
not
isinstance
(
audio_data
,
list
):
assert
is_list_of
(
audios
,
np
.
ndarray
)
audio_data
=
[
audio_data
]
# Ultravox processor doesn't support multiple inputs,
# Ultravox processor doesn't support multiple inputs,
# therefore we need to input text and audio one by one
# therefore we need to input text and audio one by one
tokenizer
=
self
.
_get_tokenizer
()
audio_features
,
audio_token_len
=
[],
[]
audio_features
,
audio_token_len
=
[],
[]
processed_inputs
=
{}
shared_outputs
=
{}
for
audio
,
sr
in
audio_data
:
for
audio
in
audios
:
data
=
self
.
_resample_audio
(
audio
,
sr
)
# NOTE: Ultravox processor accepts "audio" instead of "audios"
processed_inputs
=
super
().
_apply_hf_processor
(
item_processor_data
=
dict
(
**
processor_data
,
audio
=
audio
)
prompt
,
data
,
mm_processor_kwargs
)
prompt
=
tokenizer
.
decode
(
processed_inputs
[
"input_ids"
][
0
],
item_outputs
=
super
().
_call_hf_processor
(
skip_special_tokens
=
False
)
hf_processor
,
audio_features
.
append
(
prompt
=
prompt
,
processed_inputs
.
pop
(
"audio_values"
).
squeeze
(
0
))
processor_data
=
item_processor_data
,
audio_token_len
.
append
(
mm_processor_kwargs
=
mm_processor_kwargs
,
processed_inputs
.
pop
(
"audio_token_len"
).
item
())
)
return
dict
(
audio_features
.
append
(
item_outputs
.
pop
(
"audio_values"
)[
0
])
**
processed_inputs
,
audio_token_len
.
append
(
item_outputs
.
pop
(
"audio_token_len"
).
item
())
shared_outputs
=
item_outputs
combined_outputs
=
dict
(
**
shared_outputs
,
audio_features
=
audio_features
,
audio_features
=
audio_features
,
audio_token_len
=
audio_token_len
,
audio_token_len
=
audio_token_len
,
)
)
return
BatchFeature
(
combined_outputs
)
def
_get_processor_data
(
self
,
mm_data
:
MultiModalDataDict
,
)
->
Tuple
[
Dict
[
str
,
Any
],
Dict
[
str
,
Any
]]:
# Ultravox uses "audio" instead of "audios" as calling keyword
processor_data
,
passthrough_data
=
super
().
_get_processor_data
(
mm_data
)
if
"audios"
in
processor_data
:
processor_data
[
"audio"
]
=
processor_data
.
pop
(
"audios"
)
return
processor_data
,
passthrough_data
def
_get_prompt_replacements
(
def
_get_prompt_replacements
(
self
,
self
,
...
@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
...
@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
mm_processor_kwargs
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
list
[
PromptReplacement
]:
)
->
list
[
PromptReplacement
]:
hf_processor
=
self
.
_get_hf_processor
()
hf_processor
=
self
.
_get_hf_processor
()
placeholder
=
hf_processor
.
audio_token_replacement
placeholder
=
hf_processor
.
audio_token_replacement
# type: ignore
def
get_replacement_ultravox
(
item_idx
:
int
):
def
get_replacement_ultravox
(
item_idx
:
int
):
audio_token_len
=
hf_inputs
[
"audio_token_len"
][
item_idx
]
audio_token_len
=
hf_inputs
[
"audio_token_len"
][
item_idx
]
...
@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
...
@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
audio_count
=
mm_counts
[
"audio"
]
audio_count
=
mm_counts
[
"audio"
]
audio
=
np
.
zeros
(
audio_len
)
audio
=
np
.
zeros
(
audio_len
)
data
=
{
"audio"
:
[
(
audio
,
sampling_rate
)
]
*
audio_count
}
data
=
{
"audio"
:
[
audio
]
*
audio_count
}
return
ProcessorInputs
(
return
ProcessorInputs
(
prompt_text
=
"<|audio|>"
*
audio_count
,
prompt_text
=
"<|audio|>"
*
audio_count
,
...
@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
...
@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
@
MULTIMODAL_REGISTRY
.
register_processor
(
UltravoxMultiModalProcessor
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
UltravoxMultiModalProcessor
)
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_config
...
@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
loader
=
AutoWeightsLoader
(
self
,
loader
=
AutoWeightsLoader
(
self
,
ignore_unexpected_prefixes
=
[
"audio_tower."
])
ignore_unexpected_prefixes
=
[
"audio_tower."
])
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/parameter.py
View file @
96ae75ad
...
@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
...
@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
marlin_tile_size
=
self
.
marlin_tile_size
)
marlin_tile_size
=
self
.
marlin_tile_size
)
class
BlockQuantScaleParameter
(
_ColumnvLLMParameter
,
RowvLLMParameter
):
"""
Parameter class for weight scales loaded for weights with
block-wise quantization. Uses both column and row parallelism.
"""
pass
def
permute_param_layout_
(
param
:
BasevLLMParameter
,
input_dim
:
int
,
def
permute_param_layout_
(
param
:
BasevLLMParameter
,
input_dim
:
int
,
output_dim
:
int
,
**
kwargs
)
->
BasevLLMParameter
:
output_dim
:
int
,
**
kwargs
)
->
BasevLLMParameter
:
"""
"""
...
...
vllm/multimodal/__init__.py
View file @
96ae75ad
...
@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to
...
@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to
dispatch data processing according to its modality and the target model.
dispatch data processing according to its modality and the target model.
See also:
See also:
:ref:`input
_
processing
_
pipeline`
:ref:`input
-
processing
-
pipeline`
"""
"""
__all__
=
[
__all__
=
[
...
...
vllm/multimodal/audio.py
View file @
96ae75ad
import
numpy
as
np
import
numpy.typing
as
npt
from
vllm.inputs.registry
import
InputContext
from
vllm.inputs.registry
import
InputContext
from
vllm.utils
import
PlaceholderModule
from
.base
import
MultiModalPlugin
from
.base
import
MultiModalPlugin
from
.inputs
import
AudioItem
,
MultiModalData
,
MultiModalKwargs
from
.inputs
import
AudioItem
,
MultiModalData
,
MultiModalKwargs
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
class
AudioPlugin
(
MultiModalPlugin
):
class
AudioPlugin
(
MultiModalPlugin
):
"""Plugin for audio data."""
"""Plugin for audio data."""
...
@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin):
...
@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"There is no default maximum multimodal tokens"
)
"There is no default maximum multimodal tokens"
)
def
resample_audio
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
librosa
.
resample
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
vllm/multimodal/base.py
View file @
96ae75ad
...
@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
...
@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
(i.e., the modality of the data).
(i.e., the modality of the data).
See also:
See also:
:ref:`adding
_
multimodal
_
plugin`
:ref:`adding
-
multimodal
-
plugin`
"""
"""
def
__init__
(
self
)
->
None
:
def
__init__
(
self
)
->
None
:
...
@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC):
...
@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default input mapper is used instead.
If `None` is provided, then the default input mapper is used instead.
See also:
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`enabling
-
multimodal
-
inputs`
"""
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
def
wrapper
(
model_cls
:
N
)
->
N
:
...
@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC):
...
@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC):
TypeError: If the data type is not supported.
TypeError: If the data type is not supported.
See also:
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`enabling
-
multimodal
-
inputs`
"""
"""
# Avoid circular import
# Avoid circular import
...
@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC):
...
@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default calculation is used instead.
If `None` is provided, then the default calculation is used instead.
See also:
See also:
:ref:`enabling
_
multimodal
_
inputs`
:ref:`enabling
-
multimodal
-
inputs`
"""
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
def
wrapper
(
model_cls
:
N
)
->
N
:
...
@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC):
...
@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC):
The model is identified by ``model_config``.
The model is identified by ``model_config``.
See also:
See also:
:ref:`enabling
_
multimodal
_
inputs`
:ref:`enabling
-
multimodal
-
inputs`
"""
"""
# Avoid circular import
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
from
vllm.model_executor.model_loader
import
get_model_architecture
...
...
vllm/multimodal/image.py
View file @
96ae75ad
...
@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
...
@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
3000
return
3000
def
rescale_image_size
(
image
:
Image
.
Image
,
size_factor
:
float
,
transpose
:
int
=
-
1
)
->
Image
.
Image
:
"""Rescale the dimensions of an image by a constant factor."""
new_width
=
int
(
image
.
width
*
size_factor
)
new_height
=
int
(
image
.
height
*
size_factor
)
image
=
image
.
resize
((
new_width
,
new_height
))
if
transpose
>=
0
:
image
=
image
.
transpose
(
Image
.
Transpose
(
transpose
))
return
image
vllm/multimodal/inputs.py
View file @
96ae75ad
...
@@ -15,31 +15,32 @@ _T = TypeVar("_T")
...
@@ -15,31 +15,32 @@ _T = TypeVar("_T")
# yapf: disable
# yapf: disable
ImageItem
:
TypeAlias
=
Union
[
Image
,
np
.
ndarray
,
torch
.
Tensor
]
ImageItem
:
TypeAlias
=
Union
[
Image
,
np
.
ndarray
,
torch
.
Tensor
]
"""
"""
A :class:`transformers.image_utils.ImageInput` representing a single image
,
A :class:`transformers.image_utils.ImageInput` representing a single image
which can be passed to a HuggingFace :code:`ImageProcessor`.
item,
which can be passed to a HuggingFace :code:`ImageProcessor`.
"""
"""
VideoItem
:
TypeAlias
=
Union
[
VideoItem
:
TypeAlias
=
Union
[
L
ist
[
Image
],
l
ist
[
Image
],
np
.
ndarray
,
np
.
ndarray
,
torch
.
Tensor
,
torch
.
Tensor
,
L
ist
[
np
.
ndarray
],
l
ist
[
np
.
ndarray
],
L
ist
[
torch
.
Tensor
],
l
ist
[
torch
.
Tensor
],
]
]
"""
"""
A :class:`transformers.image_utils.VideoInput` representing a single video
A :class:`transformers.image_utils.VideoInput` representing a single video,
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
which can be passed to a HuggingFace :code:`VideoProcessor`.
"""
"""
AudioItem
:
TypeAlias
=
Union
[
AudioItem
:
TypeAlias
=
Union
[
np
.
ndarray
,
np
.
ndarray
,
List
[
float
],
list
[
float
],
Tuple
[
np
.
ndarray
,
float
],
# DEPRECATED: Use mm_processor_kwargs instead
# `(audio, sampling_rate)`: If the audio's sampling rate is different
# from that expected by the model, we need to resample it.
tuple
[
np
.
ndarray
,
float
],
]
]
"""
"""
Represents a single audio
that can be inputted to a HuggingFace
Represents a single audio
:code:`AudioProcessor`.
item, which can be passed to a HuggingFace
:code:`AudioProcessor`.
"""
"""
# yapf: enable
# yapf: enable
...
@@ -74,7 +75,7 @@ Note:
...
@@ -74,7 +75,7 @@ Note:
This dictionary also accepts modality keys defined outside
This dictionary also accepts modality keys defined outside
:class:`MultiModalDataBuiltins` as long as a customized plugin
:class:`MultiModalDataBuiltins` as long as a customized plugin
is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
Read more on that :ref:`here <adding
_
multimodal
_
plugin>`.
Read more on that :ref:`here <adding
-
multimodal
-
plugin>`.
"""
"""
...
@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
...
@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
mm_kwargs
:
MultiModalKwargs
mm_kwargs
:
MultiModalKwargs
"""Keyword arguments to be directly passed to the model after batching."""
"""Keyword arguments to be directly passed to the model after batching."""
mm_hashes
:
NotRequired
[
List
[
str
]]
"""The hashes of the multi-modal data."""
mm_placeholders
:
MultiModalPlaceholderDict
mm_placeholders
:
MultiModalPlaceholderDict
"""
"""
For each modality, information about the placeholder tokens in
For each modality, information about the placeholder tokens in
...
...
vllm/multimodal/processing.py
View file @
96ae75ad
...
@@ -17,6 +17,7 @@ from vllm.logger import init_logger
...
@@ -17,6 +17,7 @@ from vllm.logger import init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
MistralTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
MistralTokenizer
from
vllm.utils
import
flatten_2d_lists
,
full_groupby
,
is_list_of
from
vllm.utils
import
flatten_2d_lists
,
full_groupby
,
is_list_of
from
.audio
import
resample_audio
from
.inputs
import
(
AudioItem
,
ImageItem
,
MultiModalDataDict
,
from
.inputs
import
(
AudioItem
,
ImageItem
,
MultiModalDataDict
,
MultiModalInputsV2
,
MultiModalKwargs
,
PlaceholderRange
,
MultiModalInputsV2
,
MultiModalKwargs
,
PlaceholderRange
,
VideoItem
)
VideoItem
)
...
@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]]
...
@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]]
@
dataclass
@
dataclass
class
PromptReplacement
:
class
PromptReplacement
:
modality
:
str
modality
:
str
"""The modality for which the replacement is made"""
"""The modality for which the replacement is made
.
"""
target
:
_PromptSeq
target
:
_PromptSeq
"""The text or token sequence to find and replace."""
"""The text or token sequence to find and replace."""
...
@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
...
@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
corresponds to a list.
corresponds to a list.
"""
"""
@
staticmethod
def
from_dict
(
data
:
MultiModalDataDict
)
->
"MultiModalDataItems"
:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data
=
MultiModalDataItems
()
for
k
,
v
in
data
.
items
():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
(
isinstance
(
v
,
torch
.
Tensor
)
or
is_list_of
(
v
,
list
))
else
[
v
]
)
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
(
# type: ignore[index]
v
if
isinstance
(
v
,
(
torch
.
Tensor
,
list
))
else
[
v
]
)
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
return
multi_data
# NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
# `self.images` doesn't update this dictionary, which may be confusing
# We annotate the getter methods as `Sequence` to prevent others from
# trying to update the list in this way
@
property
@
property
def
image
(
self
)
->
list
[
ImageItem
]:
def
image
s
(
self
)
->
Sequence
[
ImageItem
]:
return
self
[
"image"
]
return
self
.
get
(
"image"
,
[])
@
property
@
property
def
video
(
self
)
->
list
[
VideoItem
]:
def
video
s
(
self
)
->
Sequence
[
VideoItem
]:
return
self
[
"video"
]
return
self
.
get
(
"video"
,
[])
@
property
@
property
def
audio
(
self
)
->
list
[
AudioItem
]:
def
audios
(
self
)
->
Sequence
[
AudioItem
]:
return
self
[
"audio"
]
return
self
.
get
(
"audio"
,
[])
def
get_item_counts
(
self
)
->
Mapping
[
str
,
int
]:
return
{
m
:
len
(
items
)
for
m
,
items
in
self
.
items
()}
def
get_image_size
(
self
,
item_idx
:
int
)
->
ImageSize
:
def
get_image_size
(
self
,
item_idx
:
int
)
->
ImageSize
:
image
=
self
.
image
[
item_idx
]
image
=
self
.
image
s
[
item_idx
]
if
isinstance
(
image
,
Image
):
if
isinstance
(
image
,
Image
):
return
ImageSize
(
*
image
.
size
)
return
ImageSize
(
*
image
.
size
)
...
@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
...
@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
assert_never
(
image
)
assert_never
(
image
)
def
get_audio_with_sr
(
self
,
item_idx
:
int
,
*
,
default_sr
:
float
,
)
->
tuple
[
np
.
ndarray
,
float
]:
audio
=
self
.
audios
[
item_idx
]
if
isinstance
(
audio
,
tuple
):
return
audio
if
isinstance
(
audio
,
list
):
return
np
.
array
(
audio
),
default_sr
if
isinstance
(
audio
,
np
.
ndarray
):
return
audio
,
default_sr
assert_never
(
audio
)
def
resample_audios
(
self
,
new_sr
:
float
,
*
,
drop_sr
:
bool
=
True
)
->
None
:
"""
If :code:`drop_sr=True`, the audio items in this dictionary are updated
to be NumPy arrays which implicitly means that their sampling rate is
the same as the model's expected sampling rate; otherwise, they remain
as :code:`(audio, new_sr)` tuples.
"""
if
not
self
.
audios
:
return
def
to_multi_format
(
data
:
MultiModalDataDict
)
->
MultiModalDataItems
:
new_audios
=
[]
"""
for
item_idx
in
range
(
len
(
self
.
audios
)):
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
audio
,
sr
=
self
.
get_audio_with_sr
(
item_idx
,
default_sr
=
new_sr
)
"""
audio
=
resample_audio
(
audio
,
orig_sr
=
sr
,
target_sr
=
new_sr
)
multi_data
=
MultiModalDataItems
()
new_audios
.
append
(
audio
if
drop_sr
else
(
audio
,
new_sr
))
for
k
,
v
in
data
.
items
():
# yapf: disable
if
k
==
"video"
:
# Special case since even a single item can be a list
multi_data
[
k
]
=
v
if
is_list_of
(
v
,
list
)
else
[
v
]
# type: ignore[index]
elif
k
in
(
"image"
,
"audio"
):
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
else
:
multi_data
[
k
]
=
v
if
isinstance
(
v
,
list
)
else
[
v
]
# type: ignore[index]
# yapf: enable
return
multi_data
self
[
"audio"
]
=
new_audios
class
_TokenMatch
(
NamedTuple
):
class
_TokenMatch
(
NamedTuple
):
...
@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
...
@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
def
_get_tokenizer
(
self
)
->
AnyTokenizer
:
def
_get_tokenizer
(
self
)
->
AnyTokenizer
:
return
self
.
ctx
.
tokenizer
return
self
.
ctx
.
tokenizer
def
_get_mm_items
(
self
,
mm_data
:
MultiModalDataDict
,
)
->
MultiModalDataItems
:
return
MultiModalDataItems
.
from_dict
(
mm_data
)
@
abstractmethod
@
abstractmethod
def
_get_prompt_replacements
(
def
_get_prompt_replacements
(
self
,
self
,
...
@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC):
...
@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC):
def
_get_processor_data
(
def
_get_processor_data
(
self
,
self
,
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
)
->
BatchFeature
:
)
->
tuple
[
dict
[
str
,
Any
],
dict
[
str
,
Any
]]
:
processor_data
=
dict
[
str
,
Any
]()
processor_data
=
dict
[
str
,
Any
]()
passthrough_data
=
dict
[
str
,
Any
]()
passthrough_data
=
dict
[
str
,
Any
]()
for
k
,
v
in
mm_data
.
items
():
for
k
,
v
in
mm_items
.
items
():
# TODO: Make a separate modality for embedding inputs
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# to avoid confusion
if
k
in
(
"image"
,
"video"
,
"audio"
):
if
k
in
(
"image"
,
"video"
,
"audio"
):
if
isinstance
(
v
,
torch
.
Tensor
)
and
v
.
ndim
==
3
:
if
isinstance
(
v
,
torch
.
Tensor
)
and
v
.
ndim
==
3
:
# Pass through embedding inputs (single)
# Pass through embedding inputs (single)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
[
v
]
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
[
v
]
elif
is_list_of
(
v
,
torch
.
Tensor
)
and
v
[
0
].
ndim
==
2
:
elif
(
is_list_of
(
v
,
torch
.
Tensor
)
and
len
(
v
)
>
0
and
v
[
0
].
ndim
==
2
):
# Pass through embedding inputs (multi)
# Pass through embedding inputs (multi)
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
v
passthrough_data
[
f
"
{
k
}
_embeds"
]
=
v
else
:
else
:
...
@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC):
...
@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC):
processor_data
[
f
"
{
k
}
s"
]
=
v
processor_data
[
f
"
{
k
}
s"
]
=
v
else
:
else
:
processor_data
[
k
]
=
v
processor_data
[
k
]
=
v
return
processor_data
,
passthrough_data
return
processor_data
,
passthrough_data
def
_call_hf_processor
(
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
processor_data
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
return
self
.
ctx
.
call_hf_processor
(
hf_processor
,
prompt
,
processor_data
,
mm_processor_kwargs
,
)
def
_apply_hf_processor
(
def
_apply_hf_processor
(
self
,
self
,
prompt
:
str
,
prompt
:
str
,
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
)
->
BatchFeature
:
# some mm_processor_kwargs may be used in processor initialization
# some mm_processor_kwargs may be used in processor initialization
# instead of processor call
# instead of processor call
hf_processor
=
self
.
_get_hf_processor
(
**
mm_processor_kwargs
)
hf_processor
=
self
.
_get_hf_processor
(
**
mm_processor_kwargs
)
processor_data
,
passthrough_data
=
self
.
_get_processor_data
(
mm_
data
)
processor_data
,
passthrough_data
=
self
.
_get_processor_data
(
mm_
items
)
assert
callable
(
hf_processor
)
hf_inputs
=
self
.
_call_hf_processor
(
mm_processor_kwargs
=
self
.
ctx
.
resolve_hf_processor_call_kwargs
(
hf_processor
,
hf_processor
,
mm_processor_kwargs
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
)
try
:
hf_inputs
=
hf_processor
(
text
=
prompt
,
# type: ignore
**
processor_data
,
**
mm_processor_kwargs
,
return_tensors
=
"pt"
,
)
except
Exception
as
exc
:
data
=
dict
(
text
=
prompt
,
**
processor_data
)
raise
RuntimeError
(
f
"Failed to apply
{
type
(
hf_processor
).
__name__
}
"
f
"on data=
{
data
}
with kwargs=
{
mm_processor_kwargs
}
"
)
from
exc
hf_inputs
.
update
(
passthrough_data
)
hf_inputs
.
update
(
passthrough_data
)
return
hf_inputs
return
hf_inputs
...
@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC):
...
@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC):
3. Extract information about the placeholder tokens from the
3. Extract information about the placeholder tokens from the
processed token IDs.
processed token IDs.
"""
"""
tokenizer
=
self
.
_get_
tokenizer
(
)
mm_items
=
self
.
_get_
mm_items
(
mm_data
)
hf_inputs
=
self
.
_apply_hf_processor
(
prompt_text
,
mm_
data
,
hf_inputs
=
self
.
_apply_hf_processor
(
prompt_text
,
mm_
items
,
mm_processor_kwargs
)
mm_processor_kwargs
)
prompt_ids
,
=
hf_inputs
.
pop
(
"input_ids"
).
tolist
()
prompt_ids
,
=
hf_inputs
.
pop
(
"input_ids"
).
tolist
()
mm_kwargs
=
MultiModalKwargs
(
hf_inputs
)
mm_kwargs
=
MultiModalKwargs
(
hf_inputs
)
mm_items
=
to_multi_format
(
mm_data
)
prompt_repls
=
self
.
_get_prompt_replacements
(
mm_items
,
hf_inputs
,
prompt_repls
=
self
.
_get_prompt_replacements
(
mm_items
,
hf_inputs
,
mm_processor_kwargs
)
mm_processor_kwargs
)
all_prompt_repls
=
self
.
_bind_prompt_replacements
(
prompt_repls
)
all_prompt_repls
=
self
.
_bind_prompt_replacements
(
prompt_repls
)
# If HF processor already inserts placeholder tokens,
# If HF processor already inserts placeholder tokens,
# there is no need for us to insert them
# there is no need for us to insert them
mm_item_counts
=
{
m
:
len
(
items
)
for
m
,
items
in
mm_items
.
item
s
()
}
mm_item_counts
=
mm_items
.
get_item_count
s
()
all_placeholders
=
self
.
_find_placeholders
(
all_prompt_repls
,
all_placeholders
=
self
.
_find_placeholders
(
all_prompt_repls
,
prompt_ids
,
mm_item_counts
)
prompt_ids
,
mm_item_counts
)
if
all_placeholders
:
if
all_placeholders
:
tokenizer
=
self
.
_get_tokenizer
()
prompt_text
=
_decode
(
tokenizer
,
prompt_ids
)
prompt_text
=
_decode
(
tokenizer
,
prompt_ids
)
else
:
else
:
(
(
...
...
vllm/multimodal/registry.py
View file @
96ae75ad
...
@@ -76,7 +76,7 @@ class MultiModalRegistry:
...
@@ -76,7 +76,7 @@ class MultiModalRegistry:
Register a multi-modal plugin so it can be recognized by vLLM.
Register a multi-modal plugin so it can be recognized by vLLM.
See also:
See also:
:ref:`adding
_
multimodal
_
plugin`
:ref:`adding
-
multimodal
-
plugin`
"""
"""
data_type_key
=
plugin
.
get_data_key
()
data_type_key
=
plugin
.
get_data_key
()
...
@@ -311,8 +311,8 @@ class MultiModalRegistry:
...
@@ -311,8 +311,8 @@ class MultiModalRegistry:
invoked to transform the data into a dictionary of model inputs.
invoked to transform the data into a dictionary of model inputs.
See also:
See also:
- :ref:`input
_
processing
_
pipeline`
- :ref:`input
-
processing
-
pipeline`
- :ref:`enabling
_
multimodal
_
inputs`
- :ref:`enabling
-
multimodal
-
inputs`
"""
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
def
wrapper
(
model_cls
:
N
)
->
N
:
...
...
vllm/multimodal/utils.py
View file @
96ae75ad
...
@@ -2,7 +2,7 @@ import base64
...
@@ -2,7 +2,7 @@ import base64
import
os
import
os
from
functools
import
lru_cache
from
functools
import
lru_cache
from
io
import
BytesIO
from
io
import
BytesIO
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
TypeVar
,
Union
from
typing
import
List
,
Optional
,
Tuple
,
TypeVar
,
Union
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
...
@@ -14,9 +14,25 @@ import vllm.envs as envs
...
@@ -14,9 +14,25 @@ import vllm.envs as envs
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_tokenizer
from
vllm.utils
import
PlaceholderModule
from
.inputs
import
MultiModalDataDict
,
PlaceholderRange
from
.inputs
import
MultiModalDataDict
,
PlaceholderRange
try
:
import
decord
except
ImportError
:
decord
=
PlaceholderModule
(
"decord"
)
# type: ignore[assignment]
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
...
@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str,
...
@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str,
return
image
.
convert
(
image_mode
)
return
image
.
convert
(
image_mode
)
def
_load_video_frames_from_bytes
(
b
:
bytes
):
def
_load_video_from_bytes
(
b
:
bytes
,
num_frames
:
int
=
32
)
->
npt
.
NDArray
:
frame
=
Image
.
open
(
BytesIO
(
b
))
return
np
.
array
(
frame
)
def
load_video_frames_from_base64
(
frame
:
Union
[
bytes
,
str
]):
"""Load frame from base64 format."""
return
_load_video_frames_from_bytes
(
base64
.
b64decode
(
frame
))
def
_load_video_from_bytes
(
b
:
bytes
,
num_frames
:
int
=
32
):
_
,
decord
=
try_import_video_packages
()
video_path
=
BytesIO
(
b
)
video_path
=
BytesIO
(
b
)
vr
=
decord
.
VideoReader
(
video_path
,
num_threads
=
1
)
vr
=
decord
.
VideoReader
(
video_path
,
num_threads
=
1
)
total_frame_num
=
len
(
vr
)
total_frame_num
=
len
(
vr
)
...
@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
...
@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
return
frames
return
frames
def
_load_video_from_data_url
(
video_url
:
str
):
def
_load_video_from_data_url
(
video_url
:
str
)
->
npt
.
NDArray
:
# Only split once and assume the second part is the base64 encoded image
# Only split once and assume the second part is the base64 encoded video
frames_base64
=
video_url
.
split
(
","
)[
1
:]
_
,
video_base64
=
video_url
.
split
(
","
,
1
)
return
np
.
stack
([
load_video_frames_from_base64
(
frame_base64
)
if
video_url
.
startswith
(
"data:video/jpeg;"
):
for
frame_base64
in
frames_base64
return
np
.
stack
([
])
np
.
array
(
load_image_from_base64
(
frame_base64
))
for
frame_base64
in
video_base64
.
split
(
","
)
])
return
load_video_from_base64
(
video_base64
)
def
fetch_video
(
video_url
:
str
,
*
,
num_frames
:
int
=
32
)
->
npt
.
NDArray
:
def
fetch_video
(
video_url
:
str
,
*
,
num_frames
:
int
=
32
)
->
npt
.
NDArray
:
...
@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str,
...
@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str,
return
video
return
video
def
try_import_audio_packages
()
->
Tuple
[
Any
,
Any
]:
try
:
import
librosa
import
soundfile
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[audio] for audio support."
)
from
exc
return
librosa
,
soundfile
def
fetch_audio
(
audio_url
:
str
)
->
Tuple
[
np
.
ndarray
,
Union
[
int
,
float
]]:
def
fetch_audio
(
audio_url
:
str
)
->
Tuple
[
np
.
ndarray
,
Union
[
int
,
float
]]:
"""
"""
Load audio from a URL.
Load audio from a URL.
"""
"""
librosa
,
_
=
try_import_audio_packages
()
if
audio_url
.
startswith
(
"http"
):
if
audio_url
.
startswith
(
"http"
):
audio_bytes
=
global_http_connection
.
get_bytes
(
audio_bytes
=
global_http_connection
.
get_bytes
(
audio_url
,
audio_url
,
...
@@ -253,8 +249,6 @@ async def async_fetch_audio(
...
@@ -253,8 +249,6 @@ async def async_fetch_audio(
"""
"""
Asynchronously fetch audio from a URL.
Asynchronously fetch audio from a URL.
"""
"""
librosa
,
_
=
try_import_audio_packages
()
if
audio_url
.
startswith
(
"http"
):
if
audio_url
.
startswith
(
"http"
):
audio_bytes
=
await
global_http_connection
.
async_get_bytes
(
audio_bytes
=
await
global_http_connection
.
async_get_bytes
(
audio_url
,
audio_url
,
...
@@ -313,8 +307,6 @@ def encode_audio_base64(
...
@@ -313,8 +307,6 @@ def encode_audio_base64(
sampling_rate
:
int
,
sampling_rate
:
int
,
)
->
str
:
)
->
str
:
"""Encode audio as base64."""
"""Encode audio as base64."""
_
,
soundfile
=
try_import_audio_packages
()
buffered
=
BytesIO
()
buffered
=
BytesIO
()
soundfile
.
write
(
buffered
,
audio
,
sampling_rate
,
format
=
"WAV"
)
soundfile
.
write
(
buffered
,
audio
,
sampling_rate
,
format
=
"WAV"
)
...
@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
...
@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
return
_load_image_from_bytes
(
base64
.
b64decode
(
image
))
return
_load_image_from_bytes
(
base64
.
b64decode
(
image
))
def
rescale_image_size
(
image
:
Image
.
Image
,
def
encode_video_base64
(
frames
:
npt
.
NDArray
)
->
str
:
size_factor
:
float
,
transpose
:
int
=
-
1
)
->
Image
.
Image
:
"""Rescale the dimensions of an image by a constant factor."""
new_width
=
int
(
image
.
width
*
size_factor
)
new_height
=
int
(
image
.
height
*
size_factor
)
image
=
image
.
resize
((
new_width
,
new_height
))
if
transpose
>=
0
:
image
=
image
.
transpose
(
Image
.
Transpose
(
transpose
))
return
image
def
try_import_video_packages
()
->
Any
:
try
:
import
cv2
import
decord
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[video] for video support."
)
from
exc
return
cv2
,
decord
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
Tuple
[
int
,
int
])
->
npt
.
NDArray
:
cv2
,
_
=
try_import_video_packages
()
num_frames
,
_
,
_
,
channels
=
frames
.
shape
new_height
,
new_width
=
size
resized_frames
=
np
.
empty
((
num_frames
,
new_height
,
new_width
,
channels
),
dtype
=
frames
.
dtype
)
for
i
,
frame
in
enumerate
(
frames
):
resized_frame
=
cv2
.
resize
(
frame
,
(
new_width
,
new_height
))
resized_frames
[
i
]
=
resized_frame
return
resized_frames
def
rescale_video_size
(
frames
:
npt
.
NDArray
,
size_factor
:
float
)
->
npt
.
NDArray
:
_
,
height
,
width
,
_
=
frames
.
shape
new_height
=
int
(
height
*
size_factor
)
new_width
=
int
(
width
*
size_factor
)
return
resize_video
(
frames
,
(
new_height
,
new_width
))
def
sample_frames_from_video
(
frames
:
npt
.
NDArray
,
num_frames
:
int
)
->
npt
.
NDArray
:
total_frames
=
frames
.
shape
[
0
]
if
num_frames
==
-
1
:
return
frames
else
:
frame_indices
=
np
.
linspace
(
0
,
total_frames
-
1
,
num_frames
,
dtype
=
int
)
sampled_frames
=
frames
[
frame_indices
,
...]
return
sampled_frames
def
encode_video_base64
(
frames
:
npt
.
NDArray
):
base64_frames
=
[]
base64_frames
=
[]
frames_list
=
[
frames
[
i
]
for
i
in
range
(
frames
.
shape
[
0
])]
frames_list
=
[
frames
[
i
]
for
i
in
range
(
frames
.
shape
[
0
])]
for
frame
in
frames_list
:
for
frame
in
frames_list
:
...
@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray):
...
@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray):
return
","
.
join
(
base64_frames
)
return
","
.
join
(
base64_frames
)
def
load_video_from_base64
(
video
:
Union
[
bytes
,
str
])
->
npt
.
NDArray
:
"""Load video from base64 format."""
return
_load_video_from_bytes
(
base64
.
b64decode
(
video
))
def
resolve_visual_encoder_outputs
(
def
resolve_visual_encoder_outputs
(
encoder_outputs
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
encoder_outputs
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
feature_sample_layers
:
Optional
[
list
[
int
]],
feature_sample_layers
:
Optional
[
list
[
int
]],
...
...
vllm/multimodal/video.py
View file @
96ae75ad
from
functools
import
lru_cache
from
functools
import
lru_cache
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
Optional
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
Optional
import
cv2
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
from
vllm.inputs.registry
import
InputContext
from
vllm.inputs.registry
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
...
@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
4096
return
4096
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
tuple
[
int
,
int
])
->
npt
.
NDArray
:
num_frames
,
_
,
_
,
channels
=
frames
.
shape
new_height
,
new_width
=
size
resized_frames
=
np
.
empty
((
num_frames
,
new_height
,
new_width
,
channels
),
dtype
=
frames
.
dtype
)
for
i
,
frame
in
enumerate
(
frames
):
resized_frame
=
cv2
.
resize
(
frame
,
(
new_width
,
new_height
))
resized_frames
[
i
]
=
resized_frame
return
resized_frames
def
rescale_video_size
(
frames
:
npt
.
NDArray
,
size_factor
:
float
)
->
npt
.
NDArray
:
_
,
height
,
width
,
_
=
frames
.
shape
new_height
=
int
(
height
*
size_factor
)
new_width
=
int
(
width
*
size_factor
)
return
resize_video
(
frames
,
(
new_height
,
new_width
))
def
sample_frames_from_video
(
frames
:
npt
.
NDArray
,
num_frames
:
int
)
->
npt
.
NDArray
:
total_frames
=
frames
.
shape
[
0
]
if
num_frames
==
-
1
:
return
frames
frame_indices
=
np
.
linspace
(
0
,
total_frames
-
1
,
num_frames
,
dtype
=
int
)
sampled_frames
=
frames
[
frame_indices
,
...]
return
sampled_frames
vllm/outputs.py
View file @
96ae75ad
...
@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]):
...
@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]):
pooled_data
=
seq_group
.
pooled_data
pooled_data
=
seq_group
.
pooled_data
assert
pooled_data
is
not
None
assert
pooled_data
is
not
None
output
=
PoolingOutput
(
pooled_data
)
data
=
pooled_data
.
to
(
dtype
=
torch
.
float32
,
device
=
"cpu"
)
output
=
PoolingOutput
(
data
)
prompt_token_ids
=
seq_group
.
prompt_token_ids
prompt_token_ids
=
seq_group
.
prompt_token_ids
finished
=
seq_group
.
is_finished
()
finished
=
seq_group
.
is_finished
()
...
...
vllm/platforms/cpu.py
View file @
96ae75ad
...
@@ -54,7 +54,7 @@ class CpuPlatform(Platform):
...
@@ -54,7 +54,7 @@ class CpuPlatform(Platform):
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.utils
import
GiB_bytes
from
vllm.utils
import
GiB_bytes
model_config
=
vllm_config
.
model_config
model_config
=
vllm_config
.
model_config
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
not
model_config
.
enforce_eager
:
if
not
model_config
.
enforce_eager
:
logger
.
warning
(
logger
.
warning
(
...
...
vllm/scripts.py
View file @
96ae75ad
...
@@ -165,7 +165,7 @@ def main():
...
@@ -165,7 +165,7 @@ def main():
required
=
False
,
required
=
False
,
help
=
"Read CLI options from a config file."
help
=
"Read CLI options from a config file."
"Must be a YAML with the following options:"
"Must be a YAML with the following options:"
"https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#c
ommand-line-arguments-for-the-server
"
"https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#c
li-reference
"
)
)
serve_parser
=
make_arg_parser
(
serve_parser
)
serve_parser
=
make_arg_parser
(
serve_parser
)
serve_parser
.
set_defaults
(
dispatch_function
=
serve
)
serve_parser
.
set_defaults
(
dispatch_function
=
serve
)
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
96ae75ad
...
@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
...
@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
return
spec_decode_worker
return
spec_decode_worker
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
class
SpecDecodeWorker
(
LoraNotSupportedWorkerBase
):
class
SpecDecodeWorker
(
LoraNotSupportedWorkerBase
):
"""Worker which implements speculative decoding.
"""Worker which implements speculative decoding.
...
...
Prev
1
…
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment