Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a2ad15c0
Unverified
Commit
a2ad15c0
authored
Jan 05, 2026
by
Qiping Pan
Committed by
GitHub
Jan 05, 2026
Browse files
[Model] Enable LoRA support for BLIP2 (#31620)
Signed-off-by:
Qiping Pan
<
panqiping@outlook.com
>
parent
3133c192
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
47 additions
and
4 deletions
+47
-4
docs/models/supported_models.md
docs/models/supported_models.md
+1
-1
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+46
-3
No files found.
docs/models/supported_models.md
View file @
a2ad15c0
...
@@ -673,7 +673,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
...
@@ -673,7 +673,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/aya-vision-8b`
,
`CohereLabs/aya-vision-32b`
, etc. | | ✅︎ |
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/aya-vision-8b`
,
`CohereLabs/aya-vision-32b`
, etc. | | ✅︎ |
|
`BagelForConditionalGeneration`
| BAGEL | T + I
<sup>
+
</sup>
|
`ByteDance-Seed/BAGEL-7B-MoT`
| ✅︎ | ✅︎ |
|
`BagelForConditionalGeneration`
| BAGEL | T + I
<sup>
+
</sup>
|
`ByteDance-Seed/BAGEL-7B-MoT`
| ✅︎ | ✅︎ |
|
`BeeForConditionalGeneration`
| Bee-8B | T + I
<sup>
E+
</sup>
|
`Open-Bee/Bee-8B-RL`
,
`Open-Bee/Bee-8B-SFT`
| | ✅︎ |
|
`BeeForConditionalGeneration`
| Bee-8B | T + I
<sup>
E+
</sup>
|
`Open-Bee/Bee-8B-RL`
,
`Open-Bee/Bee-8B-SFT`
| | ✅︎ |
|
`Blip2ForConditionalGeneration`
| BLIP-2 | T + I
<sup>
E
</sup>
|
`Salesforce/blip2-opt-2.7b`
,
`Salesforce/blip2-opt-6.7b`
, etc. | | ✅︎ |
|
`Blip2ForConditionalGeneration`
| BLIP-2 | T + I
<sup>
E
</sup>
|
`Salesforce/blip2-opt-2.7b`
,
`Salesforce/blip2-opt-6.7b`
, etc. |
✅︎
| ✅︎ |
|
`ChameleonForConditionalGeneration`
| Chameleon | T + I |
`facebook/chameleon-7b`
, etc. | | ✅︎ |
|
`ChameleonForConditionalGeneration`
| Chameleon | T + I |
`facebook/chameleon-7b`
, etc. | | ✅︎ |
|
`Cohere2VisionForConditionalGeneration`
| Command A Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/command-a-vision-07-2025`
, etc. | | ✅︎ |
|
`Cohere2VisionForConditionalGeneration`
| Command A Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/command-a-vision-07-2025`
, etc. | | ✅︎ |
|
`DeepseekVLV2ForCausalLM`
<sup>
^
</sup>
| DeepSeek-VL2 | T + I
<sup>
+
</sup>
|
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
, etc. | | ✅︎ |
|
`DeepseekVLV2ForCausalLM`
<sup>
^
</sup>
| DeepSeek-VL2 | T + I
<sup>
+
</sup>
|
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
, etc. | | ✅︎ |
...
...
vllm/model_executor/models/blip2.py
View file @
a2ad15c0
...
@@ -35,13 +35,15 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
...
@@ -35,13 +35,15 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.blip
import
BlipVisionModel
from
.blip
import
BlipVisionModel
,
get_blip_num_patches
from
.interfaces
import
(
from
.interfaces
import
(
MultiModalEmbeddings
,
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsMultiModal
,
SupportsPP
,
SupportsPP
,
SupportsQuant
,
SupportsQuant
,
)
)
from
.module_mapping
import
MultiModelKeys
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
...
@@ -521,7 +523,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
...
@@ -521,7 +523,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
dummy_inputs
=
Blip2DummyInputsBuilder
,
dummy_inputs
=
Blip2DummyInputsBuilder
,
)
)
class
Blip2ForConditionalGeneration
(
class
Blip2ForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
nn
.
Module
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
):
):
@
classmethod
@
classmethod
def
get_placeholder_str
(
cls
,
modality
:
str
,
i
:
int
)
->
str
|
None
:
def
get_placeholder_str
(
cls
,
modality
:
str
,
i
:
int
)
->
str
|
None
:
...
@@ -538,9 +540,17 @@ class Blip2ForConditionalGeneration(
...
@@ -538,9 +540,17 @@ class Blip2ForConditionalGeneration(
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
multimodal_config
vision_config
=
config
.
vision_config
self
.
_vision_tokens_per_image
=
(
get_blip_num_patches
(
image_size
=
vision_config
.
image_size
,
patch_size
=
vision_config
.
patch_size
,
)
+
1
# include class token
)
# TODO: Optionally initializes this for supporting embeddings.
# TODO: Optionally initializes this for supporting embeddings.
self
.
vision_model
=
BlipVisionModel
(
config
.
vision_config
,
quant_config
)
self
.
vision_model
=
BlipVisionModel
(
vision_config
,
quant_config
)
self
.
query_tokens
=
nn
.
Parameter
(
self
.
query_tokens
=
nn
.
Parameter
(
torch
.
zeros
(
1
,
config
.
num_query_tokens
,
config
.
qformer_config
.
hidden_size
)
torch
.
zeros
(
1
,
config
.
num_query_tokens
,
config
.
qformer_config
.
hidden_size
)
...
@@ -691,3 +701,36 @@ class Blip2ForConditionalGeneration(
...
@@ -691,3 +701,36 @@ class Blip2ForConditionalGeneration(
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load checkpoint tensors into this module.

    Args:
        weights: Iterable of ``(name, tensor)`` pairs to load.

    Returns:
        Whatever ``AutoWeightsLoader.load_weights`` returns for these
        weights (annotated as a set of strings).
    """
    # Delegate entirely to the generic loader, which is constructed
    # around this module instance.
    weights_loader = AutoWeightsLoader(self)
    loaded_names = weights_loader.load_weights(weights)
    return loaded_names
def get_mm_mapping(self) -> MultiModelKeys:
    """Return the names of this model's multimodal sub-components.

    The mapping names ``language_model`` as the language model,
    ``qformer`` and ``language_projection`` as the connector, and
    ``vision_model`` as the tower model.
    """
    # NOTE(review): presumably consumed by the LoRA machinery to tell the
    # language model, connector and vision tower apart — confirm with callers.
    connector_names = ["qformer", "language_projection"]
    mapping = MultiModelKeys.from_string_field(
        tower_model="vision_model",
        connector=connector_names,
        language_model="language_model",
    )
    return mapping
def get_num_mm_encoder_tokens(
    self,
    num_image_tokens: int,
) -> int:
    """Convert a count of image (query) tokens into vision-encoder tokens.

    Each image contributes ``self.config.num_query_tokens`` image tokens and
    ``self._vision_tokens_per_image`` vision-encoder tokens, so the image
    count is recovered from the former and scaled by the latter.

    Args:
        num_image_tokens: Total number of image tokens; expected to be a
            multiple of ``self.config.num_query_tokens``.

    Returns:
        The corresponding number of vision-encoder tokens as an ``int``,
        or ``0`` when ``num_image_tokens`` is zero or negative.

    Raises:
        AssertionError: If ``num_image_tokens`` is not a multiple of
            ``self.config.num_query_tokens``.
    """
    if num_image_tokens <= 0:
        return 0

    num_query_tokens = self.config.num_query_tokens
    assert num_image_tokens % num_query_tokens == 0, (
        "The number of image tokens must be a multiple of "
        "the number of query tokens."
    )
    # Floor division keeps the result an int (true division would yield a
    # float); it is exact here because divisibility was just asserted.
    num_images = num_image_tokens // num_query_tokens
    return num_images * self._vision_tokens_per_image
def get_num_mm_connector_tokens(
    self,
    num_vision_tokens: int,
) -> int:
    """Convert a count of vision-encoder tokens into connector output tokens.

    Each image contributes ``self._vision_tokens_per_image`` vision tokens
    and ``self.config.num_query_tokens`` connector tokens, so the image
    count is recovered from the former and scaled by the latter.

    Args:
        num_vision_tokens: Total number of vision-encoder tokens; expected
            to be a multiple of ``self._vision_tokens_per_image``.

    Returns:
        The corresponding number of connector tokens as an ``int``, or
        ``0`` when ``num_vision_tokens`` is zero or negative.

    Raises:
        AssertionError: If ``num_vision_tokens`` is not a multiple of
            ``self._vision_tokens_per_image``.
    """
    if num_vision_tokens <= 0:
        return 0

    tokens_per_image = self._vision_tokens_per_image
    assert num_vision_tokens % tokens_per_image == 0, (
        "The number of vision tokens must be a multiple of "
        "the number of tokens per image."
    )
    # Floor division keeps the result an int (true division would yield a
    # float); it is exact here because divisibility was just asserted.
    num_images = num_vision_tokens // tokens_per_image
    return num_images * self.config.num_query_tokens
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment