Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9f909b89
Unverified
Commit
9f909b89
authored
Aug 12, 2025
by
dongluw
Committed by
GitHub
Aug 12, 2025
Browse files
[New Model] Support Command-A-Vision (#22660)
Signed-off-by:
donglu
<
donglu@cohere.com
>
parent
59f3b936
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
510 additions
and
1 deletion
+510
-1
docs/models/supported_models.md
docs/models/supported_models.md
+2
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+24
-0
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+37
-0
tests/models/registry.py
tests/models/registry.py
+1
-0
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere2_vision.py
+445
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-0
No files found.
docs/models/supported_models.md
View file @
9f909b89
...
@@ -331,7 +331,7 @@ th {
...
@@ -331,7 +331,7 @@ th {
|
`BloomForCausalLM`
| BLOOM, BLOOMZ, BLOOMChat |
`bigscience/bloom`
,
`bigscience/bloomz`
, etc. | | ✅︎ | |
|
`BloomForCausalLM`
| BLOOM, BLOOMZ, BLOOMChat |
`bigscience/bloom`
,
`bigscience/bloomz`
, etc. | | ✅︎ | |
|
`BartForConditionalGeneration`
| BART |
`facebook/bart-base`
,
`facebook/bart-large-cnn`
, etc. | | | |
|
`BartForConditionalGeneration`
| BART |
`facebook/bart-base`
,
`facebook/bart-large-cnn`
, etc. | | | |
|
`ChatGLMModel`
,
`ChatGLMForConditionalGeneration`
| ChatGLM |
`zai-org/chatglm2-6b`
,
`zai-org/chatglm3-6b`
,
`ShieldLM-6B-chatglm3`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`ChatGLMModel`
,
`ChatGLMForConditionalGeneration`
| ChatGLM |
`zai-org/chatglm2-6b`
,
`zai-org/chatglm3-6b`
,
`ShieldLM-6B-chatglm3`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`CohereForCausalLM`
,
`Cohere2ForCausalLM`
| Command-R |
`Cohere
ForAI
/c4ai-command-r-v01`
,
`Cohere
ForAI
/c4ai-command-r7b-12-2024`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`CohereForCausalLM`
,
`Cohere2ForCausalLM`
| Command-R |
`Cohere
Labs
/c4ai-command-r-v01`
,
`Cohere
Labs
/c4ai-command-r7b-12-2024`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`DbrxForCausalLM`
| DBRX |
`databricks/dbrx-base`
,
`databricks/dbrx-instruct`
, etc. | | ✅︎ | ✅︎ |
|
`DbrxForCausalLM`
| DBRX |
`databricks/dbrx-base`
,
`databricks/dbrx-instruct`
, etc. | | ✅︎ | ✅︎ |
|
`DeciLMForCausalLM`
| DeciLM |
`nvidia/Llama-3_3-Nemotron-Super-49B-v1`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`DeciLMForCausalLM`
| DeciLM |
`nvidia/Llama-3_3-Nemotron-Super-49B-v1`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`DeepseekForCausalLM`
| DeepSeek |
`deepseek-ai/deepseek-llm-67b-base`
,
`deepseek-ai/deepseek-llm-7b-chat`
, etc. | | ✅︎ | ✅︎ |
|
`DeepseekForCausalLM`
| DeepSeek |
`deepseek-ai/deepseek-llm-67b-base`
,
`deepseek-ai/deepseek-llm-7b-chat`
, etc. | | ✅︎ | ✅︎ |
...
@@ -601,6 +601,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
...
@@ -601,6 +601,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereForAI/aya-vision-8b`
,
`CohereForAI/aya-vision-32b`
, etc. | | ✅︎ | ✅︎ |
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereForAI/aya-vision-8b`
,
`CohereForAI/aya-vision-32b`
, etc. | | ✅︎ | ✅︎ |
|
`Blip2ForConditionalGeneration`
| BLIP-2 | T + I
<sup>
E
</sup>
|
`Salesforce/blip2-opt-2.7b`
,
`Salesforce/blip2-opt-6.7b`
, etc. | | ✅︎ | ✅︎ |
|
`Blip2ForConditionalGeneration`
| BLIP-2 | T + I
<sup>
E
</sup>
|
`Salesforce/blip2-opt-2.7b`
,
`Salesforce/blip2-opt-6.7b`
, etc. | | ✅︎ | ✅︎ |
|
`ChameleonForConditionalGeneration`
| Chameleon | T + I |
`facebook/chameleon-7b`
, etc. | | ✅︎ | ✅︎ |
|
`ChameleonForConditionalGeneration`
| Chameleon | T + I |
`facebook/chameleon-7b`
, etc. | | ✅︎ | ✅︎ |
|
`Cohere2VisionForConditionalGeneration`
| Command A Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/command-a-vision-07-2025`
, etc. | | ✅︎ | ✅︎ |
|
`DeepseekVLV2ForCausalLM`
<sup>
^
</sup>
| DeepSeek-VL2 | T + I
<sup>
+
</sup>
|
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
, etc. | | ✅︎ | ✅︎ |
|
`DeepseekVLV2ForCausalLM`
<sup>
^
</sup>
| DeepSeek-VL2 | T + I
<sup>
+
</sup>
|
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
, etc. | | ✅︎ | ✅︎ |
|
`Florence2ForConditionalGeneration`
| Florence-2 | T + I |
`microsoft/Florence-2-base`
,
`microsoft/Florence-2-large`
, etc. | | | |
|
`Florence2ForConditionalGeneration`
| Florence-2 | T + I |
`microsoft/Florence-2-base`
,
`microsoft/Florence-2-large`
, etc. | | | |
|
`FuyuForCausalLM`
| Fuyu | T + I |
`adept/fuyu-8b`
, etc. | | ✅︎ | ✅︎ |
|
`FuyuForCausalLM`
| Fuyu | T + I |
`adept/fuyu-8b`
, etc. | | ✅︎ | ✅︎ |
...
...
examples/offline_inference/vision_language.py
View file @
9f909b89
...
@@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
...
@@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
)
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Command A Vision.

    Args:
        questions: User questions; one prompt is produced per question.
        modality: Must be ``"image"``; it is also used as the key of the
            per-prompt multimodal limit.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts.
    """
    assert modality == "image"

    checkpoint = "CohereLabs/command-a-vision-07-2025"

    # One multimodal item of the requested modality is allowed per prompt.
    args = EngineArgs(
        model=checkpoint,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    # Each question is wrapped in a user turn that starts with a single image
    # placeholder and is followed by the opening of the chatbot turn.
    chat_prompts = []
    for question in questions:
        chat_prompts.append(
            f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        )

    return ModelRequestData(engine_args=args, prompts=chat_prompts)
# Deepseek-VL2
# Deepseek-VL2
def
run_deepseek_vl2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
def
run_deepseek_vl2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
...
@@ -1417,6 +1440,7 @@ model_example_map = {
...
@@ -1417,6 +1440,7 @@ model_example_map = {
"aya_vision"
:
run_aya_vision
,
"aya_vision"
:
run_aya_vision
,
"blip-2"
:
run_blip2
,
"blip-2"
:
run_blip2
,
"chameleon"
:
run_chameleon
,
"chameleon"
:
run_chameleon
,
"command_a_vision"
:
run_command_a_vision
,
"deepseek_vl_v2"
:
run_deepseek_vl2
,
"deepseek_vl_v2"
:
run_deepseek_vl2
,
"florence2"
:
run_florence2
,
"florence2"
:
run_florence2
,
"fuyu"
:
run_fuyu
,
"fuyu"
:
run_fuyu
,
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
9f909b89
...
@@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
...
@@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
)
)
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Command A Vision.

    Args:
        question: Text question placed after the image entries in the user
            message.
        image_urls: URLs of the images; every URL yields one image entry in
            the chat message and one fetched image in the request.

    Returns:
        A ``ModelRequestData`` with engine arguments, the chat-templated
        prompt and the fetched images.
    """
    model_name = "CohereLabs/command-a-vision-07-2025"

    # NOTE: The checkpoint is very large and requires tensor parallelism;
    # tp=4 on H100 GPUs is the recommended setup.
    args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, followed by the text question.
    user_content = []
    for url in image_urls:
        user_content.append({"type": "image", "image": url})
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    hf_processor = AutoProcessor.from_pretrained(model_name)
    rendered = hf_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    fetched = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=args,
        prompt=rendered,
        image_data=fetched,
    )
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
...
@@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
...
@@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_example_map
=
{
model_example_map
=
{
"aria"
:
load_aria
,
"aria"
:
load_aria
,
"aya_vision"
:
load_aya_vision
,
"aya_vision"
:
load_aya_vision
,
"command_a_vision"
:
load_command_a_vision
,
"deepseek_vl_v2"
:
load_deepseek_vl2
,
"deepseek_vl_v2"
:
load_deepseek_vl2
,
"gemma3"
:
load_gemma3
,
"gemma3"
:
load_gemma3
,
"h2ovl_chat"
:
load_h2ovl
,
"h2ovl_chat"
:
load_h2ovl
,
...
...
tests/models/registry.py
View file @
9f909b89
...
@@ -383,6 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -383,6 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
}),
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
}),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"Cohere2VisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/command-a-vision-07-2025"
),
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
...
...
vllm/model_executor/models/cohere2_vision.py
0 → 100644
View file @
9f909b89
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from vllm/model_executor/models/aya_vision.py
"""Command-A-Vision (Cohere2Vision) multimodal model implementation for vLLM."""
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Literal
,
Optional
,
Union
import
torch
from
torch
import
nn
from
transformers
import
BatchFeature
,
PretrainedConfig
from
transformers.models.cohere2_vision
import
Cohere2VisionConfig
from
transformers.models.cohere2_vision.processing_cohere2_vision
import
(
Cohere2VisionProcessor
)
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.activation
import
MulAndSilu
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalKwargs
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
MultiModalFieldConfig
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
class Cohere2VisionImagePixelInputs(TensorSchema):
    """Schema of the pixel-value inputs consumed by the Cohere2Vision model.

    Dimensions:
        - np: The total number of patches over each image over each prompt in
          the batch
        - c: Number of channels (fixed to 3 in the declared shape below)
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    """

    # Discriminator tag; the only accepted value is "pixel_values".
    type: Literal["pixel_values"]

    # Image patches, declared as (np, 3, h, w).
    pixel_values: Annotated[
        torch.Tensor,
        TensorShape("np", 3, "h", "w"),
    ]

    # One entry per image, declared as (bn,).
    # NOTE(review): presumably the entries sum to `np`, since the model splits
    # the projected patches by these counts -- confirm against the processor.
    num_patches: Annotated[
        torch.Tensor,
        TensorShape("bn"),
    ]
class Cohere2VisionMultiModalProjector(nn.Module):
    """Multimodal projector that maps vision features to text embedding space.

    Uses pixel shuffle downsampling followed by a SwiGLU-style gated MLP
    (merged gate/up projection, ``MulAndSilu`` activation, down projection).
    """

    def __init__(self, config: Cohere2VisionConfig, prefix: str = ""):
        """Build the projector layers.

        Args:
            config: Model config; ``downsample_factor``,
                ``alignment_intermediate_size``, ``vision_config.hidden_size``
                and ``text_config.hidden_size`` are read from it.
            prefix: Dotted module prefix used to name the linear layers.
        """
        super().__init__()
        self.downsample_factor = config.downsample_factor

        # Input dimension after pixel shuffle downsampling: every
        # factor x factor neighbourhood is folded into the channel dimension.
        input_dim = config.vision_config.hidden_size * (
            config.downsample_factor**2)
        # MergedColumnParallelLinear expects the intermediate size to be a list
        # of sizes, so that it will load the weights as two separate linear
        # layers before applying any parallelism.
        # We need to divide the alignment intermediate size by 2 because
        # the weights are merged weights of two linear layers for SwiGLU.
        self.intermediate_size = config.alignment_intermediate_size // 2

        self.linear_1 = MergedColumnParallelLinear(
            input_dim,
            [self.intermediate_size] * 2,
            bias=True,
            return_bias=False,
            prefix=f"{prefix}.linear_1",
        )
        self.act = MulAndSilu()
        self.linear_2 = RowParallelLinear(
            self.intermediate_size,
            config.text_config.hidden_size,
            bias=True,
            return_bias=False,
            prefix=f"{prefix}.linear_2",
        )

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        """Downsample the vision features and project them.

        Args:
            image_features: Vision features accepted by ``pixel_shuffle``.

        Returns:
            The output of ``linear_2`` applied to the activated features.
        """
        image_features = self.pixel_shuffle(image_features)
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states

    def pixel_shuffle(self, image_features: torch.Tensor) -> torch.Tensor:
        """Apply pixel shuffle downsampling to reduce spatial dimensions.

        Args:
            image_features: Input tensor of shape [B, S, D] where S = H*W and
                the grid is square (H == W).

        Returns:
            Tensor of shape [B, H/f, W/f, D*f*f], where f is
            ``self.downsample_factor``.

        Raises:
            ValueError: If S is not a perfect square, or the grid side is not
                divisible by the downsample factor.
        """
        factor = self.downsample_factor
        seq_len = image_features.shape[1]

        # Round instead of truncating so a float square root that lands just
        # below the true value cannot yield a side that is one too small;
        # the equality check then rejects anything that is not a true square.
        side = round(seq_len**0.5)
        if side * side != seq_len:
            raise ValueError(
                "pixel_shuffle expects a square patch grid, but got a "
                f"sequence length of {seq_len}")
        if side % factor != 0:
            raise ValueError(
                f"patch grid side {side} is not divisible by "
                f"downsample_factor {factor}")

        x = image_features.reshape(image_features.shape[0], side, side, -1)
        n, h, w, c = x.size()
        # Integer division: exact for any factor, unlike multiplying by the
        # floating-point reciprocal and truncating.
        nh = h // factor
        nw = w // factor
        # Split both spatial axes into (coarse, fine), then move the two fine
        # axes next to the channels so they can be merged into them.
        x = x.reshape(n, nh, factor, nw, factor, c)
        x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
        x = x.reshape(n, nh, nw, -1)
        return x
class Cohere2VisionProcessingInfo(BaseProcessingInfo):
    """Accessors for the HF config/processor plus image sizing helpers."""

    def get_hf_config(self) -> Cohere2VisionConfig:
        """Return the HF config, requested as a ``Cohere2VisionConfig``."""
        hf_config = self.ctx.get_hf_config(Cohere2VisionConfig)
        return hf_config

    def get_hf_processor(self, **kwargs: object) -> Cohere2VisionProcessor:
        """Return the HF processor, forwarding ``kwargs`` to the context."""
        hf_processor = self.ctx.get_hf_processor(Cohere2VisionProcessor,
                                                 **kwargs)
        return hf_processor

    def get_image_processor(self, **kwargs: object):
        """Return the ``image_processor`` attribute of the HF processor."""
        hf_processor = self.get_hf_processor(**kwargs)
        return hf_processor.image_processor

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """Return the per-modality limits: images with a ``None`` limit."""
        limits = {"image": None}
        return limits

    def get_image_size_with_most_features(self) -> ImageSize:
        """Return the image size used as the largest-feature case.

        The height is the image processor's configured height multiplied by
        its ``max_patches``; the width is the configured width.
        """
        proc = self.get_image_processor()
        tile_height = proc.size['height']
        tile_width = proc.size['width']
        tile_limit = proc.max_patches
        return ImageSize(height=tile_height * tile_limit, width=tile_width)

    def get_num_patches(self, image_width: int, image_height: int) -> int:
        """Return the number of image patches for an image of the given size.

        Delegates to the HF image processor, which takes the height first,
        then the width, then an (empty) kwargs mapping.
        """
        image_processor = self.get_hf_processor().image_processor
        return image_processor.get_number_of_image_patches(
            image_height, image_width, {})
class Cohere2VisionDummyInputsBuilder(
        BaseDummyInputsBuilder[Cohere2VisionProcessingInfo]):
    """Builds dummy text and image inputs from the requested image count."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return the processor's image token repeated once per image."""
        image_count = mm_counts.get("image", 0)
        placeholder = self.info.get_hf_processor().image_token
        return placeholder * image_count

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        """Return dummy images at the size reported as having most features.

        ``seq_len`` is accepted for interface compatibility and is not used.
        """
        image_count = mm_counts.get("image", 0)
        target = self.info.get_image_size_with_most_features()

        dummy_images = self._get_dummy_images(
            width=target.width,
            height=target.height,
            num_images=image_count,
        )
        return {"image": dummy_images}
class Cohere2VisionMultiModalProcessor(
        BaseMultiModalProcessor[Cohere2VisionProcessingInfo]):
    """Multimodal processor for Cohere2Vision image inputs.

    Wraps the HF processor call, declares how the processed tensors are split
    per image, and expands each image token in the prompt into the full
    begin-of-image / tiles / end-of-image placeholder sequence.
    """

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Run the HF processor and make sure ``num_patches`` is present.

        Returns:
            The HF processor output. When it lacks ``num_patches`` and
            ``mm_data`` contains ``"images"``, a tensor with one patch count
            per image is added under that key.
        """
        processed_outputs = super()._call_hf_processor(
            prompt,
            mm_data,
            mm_kwargs,
            tok_kwargs,
        )

        # Ensure num_patches is available for proper tensor splitting
        if "num_patches" not in processed_outputs and (
                images := mm_data.get("images")) is not None:
            # Fallback calculation if HF processor didn't provide num_patches
            parsed_images = self._get_data_parser().parse_mm_data({
                "image":
                images
            }).get_items("image", ImageProcessorItems)

            # Look up each image size once, then pass width/height by keyword
            # so the argument order cannot be mixed up.
            image_sizes = [
                parsed_images.get_image_size(i)
                for i in range(len(parsed_images))
            ]
            num_patches = [
                self.info.get_num_patches(image_width=size.width,
                                          image_height=size.height)
                for size in image_sizes
            ]
            processed_outputs["num_patches"] = torch.tensor(num_patches)

        return processed_outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Describe how each processed field is divided among images.

        ``pixel_values`` is a flat tensor split by the per-image patch counts;
        ``num_patches`` and ``image_embeds`` are batched per image.
        """
        # An empty tensor stands in when no patch counts were produced.
        num_patches = hf_inputs.get("num_patches", torch.empty(0))
        return dict(
            pixel_values=MultiModalFieldConfig.flat_from_sizes(
                "image", num_patches),
            num_patches=MultiModalFieldConfig.batched("image"),
            image_embeds=MultiModalFieldConfig.batched("image"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Return the replacement that expands each image token in the prompt.

        Every image token is replaced by the begin-of-image token, one run of
        ``patch_size ** 2`` image tokens plus a line-break token per patch,
        and the end-of-image token.
        """
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_token = hf_processor.image_token
        img_line_break_token = hf_processor.img_line_break_token
        boi_token = hf_processor.boi_token
        eoi_token = hf_processor.eoi_token

        def get_replacement(item_idx: int):
            images: ImageProcessorItems = mm_items.get("image",
                                                       ImageProcessorItems)
            image_size: ImageSize = images.get_image_size(item_idx)

            # Keyword arguments are required for correctness here:
            # get_num_patches is declared as (image_width, image_height), so
            # passing (height, width) positionally swaps the two dimensions
            # for every non-square image.
            num_patches = self.info.get_num_patches(
                image_width=image_size.width,
                image_height=image_size.height,
            )
            img_tokens_per_tile = int(hf_processor.patch_size**2)
            single_tile_tokens = (image_token * img_tokens_per_tile +
                                  img_line_break_token)
            # Built on one logical line: a backslash continuation inside the
            # string literal would embed the source indentation between the
            # tokens of the placeholder text.
            img_string = (f"{boi_token}"
                          f"{single_tile_tokens * num_patches}"
                          f"{eoi_token}")

            return PromptUpdateDetails.select_text(img_string, image_token)

        return [
            PromptReplacement(
                modality="image",
                target=image_token,
                replacement=get_replacement,
            )
        ]
@MULTIMODAL_REGISTRY.register_processor(
    Cohere2VisionMultiModalProcessor,
    info=Cohere2VisionProcessingInfo,
    dummy_inputs=Cohere2VisionDummyInputsBuilder)
class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                                            SupportsPP):
    """Command-A-Vision model: vision tower + projector + language model.

    Images are encoded by a SigLIP vision tower, projected into the text
    embedding space, and merged into the language model's input embeddings at
    the positions of the image placeholder token.
    """

    # Renames checkpoint weight prefixes to the attribute layout of this
    # module (vision_tower / multi_modal_projector / language_model).
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.vision_tower.": "vision_tower.",
            "model.multi_modal_projector.": "multi_modal_projector.",
            "model.language_model.": "language_model.model.",
            "lm_head.": "language_model.lm_head.",
        })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the vision tower, the projector and the language model.

        Args:
            vllm_config: Engine configuration; the HF config, quantization
                config and multimodal config are read from it.
            prefix: Module prefix prepended to the sub-module names.
        """
        super().__init__()
        config: Cohere2VisionConfig = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.quant_config = quant_config
        self.multimodal_config = multimodal_config
        # Must run before the vision tower is built, because the (possibly
        # patched) quant_config is passed to it below.
        self._patch_quant_config(config, quant_config)

        self.vision_tower = SiglipVisionModel(config.vision_config,
                                              quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "vision_tower"))
        self.vocab_size = config.text_config.vocab_size
        self.multi_modal_projector = \
            Cohere2VisionMultiModalProjector(
                config, prefix=maybe_prefix(prefix, "multi_modal_projector"))
        # The text backbone is instantiated from the text sub-config with the
        # architecture pinned to Cohere2ForCausalLM.
        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Cohere2ForCausalLM"])

    @property
    def dtype(self):
        """Dtype of the first parameter returned by ``self.parameters()``."""
        return next(self.parameters()).dtype

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load checkpoint weights, renaming them via ``hf_to_vllm_mapper``.

        Returns:
            The value returned by ``AutoWeightsLoader.load_weights``.
        """
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

    def _process_image_input(self, image_input: Cohere2VisionImagePixelInputs,
                             **kwargs) -> list[torch.Tensor]:
        """Process image pixels through vision tower and projector.

        Args:
            image_input: Validated image input containing pixel values and
                         patch counts

        Returns:
            List of flattened image embeddings, one per image
        """
        assert self.vision_tower is not None, "Vision tower is required"

        pixel_values = image_input["pixel_values"]
        num_patches = image_input["num_patches"]

        # Extract visual features
        image_features = self.vision_tower(pixel_values)

        # Project to text embedding space
        image_embeds = self.multi_modal_projector(image_features)

        # Split and flatten embeddings per image: the patch axis is split by
        # the per-image patch counts, then the first three dimensions of each
        # chunk are merged into one.
        return [
            e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())
        ]

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[Cohere2VisionImagePixelInputs]:
        """Extract the image inputs from ``kwargs`` and validate their shape.

        Returns:
            ``None`` when no ``pixel_values`` were passed; otherwise a
            ``Cohere2VisionImagePixelInputs`` whose ``h``/``w`` dimensions are
            bound to the vision config's ``image_size``.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        num_patches = kwargs.pop("num_patches", None)
        image_embeds = kwargs.pop("image_embeds", None)
        assert image_embeds is None, \
            "Cohere2Vision does not support image_embeds."

        if pixel_values is None:
            return None

        return Cohere2VisionImagePixelInputs(
            type="pixel_values",
            pixel_values=flatten_bn(pixel_values, concat=True),
            num_patches=flatten_bn(num_patches, concat=True),
            resolve_bindings={
                "h": self.config.vision_config.image_size,
                "w": self.config.vision_config.image_size,
            })

    def _patch_quant_config(self, config: PretrainedConfig,
                            quant_config: QuantizationConfig):
        """Add ``"vision_tower"`` to the AWQ modules that are not converted.

        Only applies when ``quant_config`` is an ``AWQConfig`` whose
        ``modules_to_not_convert`` is empty and the text config carries a
        ``quantization_config``; ``quant_config`` is mutated in place.
        """
        # the awq models from OpenGVLab missing `modules_to_not_convert`
        # patch the quant_config to add `modules_to_not_convert` back
        # NOTE(review): the OpenGVLab reference above looks inherited from
        # another model implementation -- confirm it applies to this model.
        if isinstance(quant_config, AWQConfig):
            text_config = config.text_config
            llm_quant_config = getattr(text_config, "quantization_config",
                                       None)
            if (not quant_config.modules_to_not_convert) and (llm_quant_config
                                                              is not None):
                quant_config.modules_to_not_convert.append("vision_tower")

    def get_language_model(self) -> torch.nn.Module:
        """Return the wrapped language model module."""
        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        """Return per-image embeddings, or an empty list without images."""
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return []

        return self._process_image_input(image_input, **kwargs)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` and merge in the multimodal embeddings.

        The merge is only performed when ``multimodal_embeddings`` is given
        and non-empty; it targets positions holding ``config.image_token_id``.
        """
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is not None \
            and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                multimodal_embeddings=multimodal_embeddings,
                placeholder_token_id=self.config.image_token_id,
            )

        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the language model on token ids or precomputed embeddings.

        When ``intermediate_tensors`` is given, ``inputs_embeds`` is dropped.
        Otherwise, when no embeddings are supplied, they are computed here
        from ``input_ids`` and the image inputs in ``kwargs``, and
        ``input_ids`` is cleared before the language model call.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
            input_ids = None

        hidden_states = self.language_model.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )

        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Delegate logits computation to the language model."""
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)
vllm/model_executor/models/registry.py
View file @
9f909b89
...
@@ -201,6 +201,7 @@ _MULTIMODAL_MODELS = {
...
@@ -201,6 +201,7 @@ _MULTIMODAL_MODELS = {
"AyaVisionForConditionalGeneration"
:
(
"aya_vision"
,
"AyaVisionForConditionalGeneration"
),
# noqa: E501
"AyaVisionForConditionalGeneration"
:
(
"aya_vision"
,
"AyaVisionForConditionalGeneration"
),
# noqa: E501
"Blip2ForConditionalGeneration"
:
(
"blip2"
,
"Blip2ForConditionalGeneration"
),
"Blip2ForConditionalGeneration"
:
(
"blip2"
,
"Blip2ForConditionalGeneration"
),
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
# noqa: E501
"Cohere2VisionForConditionalGeneration"
:
(
"cohere2_vision"
,
"Cohere2VisionForConditionalGeneration"
),
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
(
"deepseek_vl2"
,
"DeepseekVLV2ForCausalLM"
),
"DeepseekVLV2ForCausalLM"
:
(
"deepseek_vl2"
,
"DeepseekVLV2ForCausalLM"
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"Gemma3ForConditionalGeneration"
:
(
"gemma3_mm"
,
"Gemma3ForConditionalGeneration"
),
# noqa: E501
"Gemma3ForConditionalGeneration"
:
(
"gemma3_mm"
,
"Gemma3ForConditionalGeneration"
),
# noqa: E501
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment