Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1405f0c7
Unverified
Commit
1405f0c7
authored
Oct 01, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 01, 2025
Browse files
[Misc] Factor out common `_apply_feature_select_strategy` (#26003)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent
84d57342
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
40 additions
and
39 deletions
+40
-39
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+3
-16
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+3
-2
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+6
-17
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+28
-4
No files found.
vllm/model_executor/models/llava.py
View file @
1405f0c7
...
@@ -41,7 +41,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
...
@@ -41,7 +41,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
from
.siglip
import
SiglipVisionModel
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
)
init_vllm_registered_model
,
maybe_prefix
)
from
.vision
import
get_vision_encoder_info
from
.vision
import
get_num_selected_vision_tokens
,
get_vision_encoder_info
class
LlavaImagePixelInputs
(
TensorSchema
):
class
LlavaImagePixelInputs
(
TensorSchema
):
...
@@ -147,19 +147,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
...
@@ -147,19 +147,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
return
{
"image"
:
None
}
def
_apply_feature_select_strategy
(
self
,
strategy
:
str
,
encoder_num_image_tokens
:
int
,
)
->
int
:
if
strategy
==
"default"
:
return
encoder_num_image_tokens
-
1
if
strategy
==
"full"
:
return
encoder_num_image_tokens
msg
=
f
"Unexpected feature select strategy:
{
strategy
!
r
}
"
raise
NotImplementedError
(
msg
)
def
get_num_image_tokens
(
def
get_num_image_tokens
(
self
,
self
,
*
,
*
,
...
@@ -169,12 +156,12 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
...
@@ -169,12 +156,12 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
hf_config
=
self
.
get_hf_config
()
hf_config
=
self
.
get_hf_config
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
return
self
.
_apply_feature_select_strategy
(
return
get_num_selected_vision_tokens
(
hf_config
.
vision_feature_select_strategy
,
vision_encoder_info
.
get_num_image_tokens
(
vision_encoder_info
.
get_num_image_tokens
(
image_width
=
image_width
,
image_width
=
image_width
,
image_height
=
image_height
,
image_height
=
image_height
,
),
),
hf_config
.
vision_feature_select_strategy
,
)
)
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
...
...
vllm/model_executor/models/llava_next.py
View file @
1405f0c7
...
@@ -27,6 +27,7 @@ from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo,
...
@@ -27,6 +27,7 @@ from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo,
from
.siglip
import
SiglipVisionModel
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
)
init_vllm_registered_model
,
maybe_prefix
)
from
.vision
import
get_num_selected_vision_tokens
class
LlavaNextImagePixelInputs
(
TensorSchema
):
class
LlavaNextImagePixelInputs
(
TensorSchema
):
...
@@ -95,12 +96,12 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
...
@@ -95,12 +96,12 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
hf_config
=
self
.
get_hf_config
()
hf_config
=
self
.
get_hf_config
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
base_feature_size
=
self
.
_apply_feature_select_strategy
(
base_feature_size
=
get_num_selected_vision_tokens
(
hf_config
.
vision_feature_select_strategy
,
vision_encoder_info
.
get_num_image_tokens
(
vision_encoder_info
.
get_num_image_tokens
(
image_width
=
image_width
,
image_width
=
image_width
,
image_height
=
image_height
,
image_height
=
image_height
,
),
),
hf_config
.
vision_feature_select_strategy
,
)
)
num_patch_height
,
num_patch_width
=
get_anyres_image_grid_shape
(
num_patch_height
,
num_patch_width
=
get_anyres_image_grid_shape
(
...
...
vllm/model_executor/models/tarsier.py
View file @
1405f0c7
...
@@ -40,7 +40,8 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
...
@@ -40,7 +40,8 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from
.siglip
import
SiglipVisionModel
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
)
maybe_prefix
)
from
.vision
import
VisionEncoderInfo
,
get_vision_encoder_info
from
.vision
import
(
VisionEncoderInfo
,
get_num_selected_vision_tokens
,
get_vision_encoder_info
)
class
TarsierImagePixelInputs
(
TensorSchema
):
class
TarsierImagePixelInputs
(
TensorSchema
):
...
@@ -201,18 +202,6 @@ class TarsierProcessingInfo(BaseProcessingInfo):
...
@@ -201,18 +202,6 @@ class TarsierProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
return
{
"image"
:
None
}
def
_apply_feature_select_strategy
(
self
,
strategy
:
str
,
encoder_num_image_tokens
:
int
,
)
->
int
:
if
strategy
==
"default"
:
return
encoder_num_image_tokens
-
1
if
strategy
==
"full"
:
return
encoder_num_image_tokens
msg
=
f
"Unexpected feature select strategy:
{
strategy
!
r
}
"
raise
NotImplementedError
(
msg
)
def
get_num_image_tokens
(
def
get_num_image_tokens
(
self
,
self
,
*
,
*
,
...
@@ -221,21 +210,21 @@ class TarsierProcessingInfo(BaseProcessingInfo):
...
@@ -221,21 +210,21 @@ class TarsierProcessingInfo(BaseProcessingInfo):
)
->
int
:
)
->
int
:
hf_config
=
self
.
get_hf_config
()
hf_config
=
self
.
get_hf_config
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
vision_encoder_info
=
self
.
get_vision_encoder_info
()
num_projected_patches
=
self
.
_apply_feature_select_strategy
(
num_projected_patches
=
get_num_selected_vision_tokens
(
hf_config
.
vision_feature_select_strategy
,
vision_encoder_info
.
get_num_image_tokens
(
vision_encoder_info
.
get_num_image_tokens
(
image_width
=
image_width
,
image_width
=
image_width
,
image_height
=
image_height
,
image_height
=
image_height
,
),
),
hf_config
.
vision_feature_select_strategy
,
)
)
if
num_projected_patches
<=
0
:
if
num_projected_patches
<=
0
:
default_size
=
self
.
get_image_size_with_most_features
()
default_size
=
self
.
get_image_size_with_most_features
()
num_projected_patches_default
=
self
.
_apply_feature_select_strategy
(
num_projected_patches_default
=
get_num_selected_vision_tokens
(
hf_config
.
vision_feature_select_strategy
,
vision_encoder_info
.
get_num_image_tokens
(
vision_encoder_info
.
get_num_image_tokens
(
image_width
=
default_size
.
width
,
image_width
=
default_size
.
width
,
image_height
=
default_size
.
height
,
image_height
=
default_size
.
height
,
),
),
hf_config
.
vision_feature_select_strategy
,
)
)
if
num_projected_patches_default
<=
0
:
if
num_projected_patches_default
<=
0
:
raise
ValueError
(
raise
ValueError
(
...
...
vllm/model_executor/models/vision.py
View file @
1405f0c7
...
@@ -9,7 +9,6 @@ from typing import (Callable, Final, Generic, Literal, Optional, Protocol,
...
@@ -9,7 +9,6 @@ from typing import (Callable, Final, Generic, Literal, Optional, Protocol,
import
torch
import
torch
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
typing_extensions
import
assert_never
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
...
@@ -22,9 +21,13 @@ logger = init_logger(__name__)
...
@@ -22,9 +21,13 @@ logger = init_logger(__name__)
_C
=
TypeVar
(
"_C"
,
bound
=
PretrainedConfig
)
_C
=
TypeVar
(
"_C"
,
bound
=
PretrainedConfig
)
class _RootConfig(Protocol[_C]):
    """Structural type for a config that exposes a ``vision_config``."""

    # Typed with `_C`, the TypeVar bound to `PretrainedConfig`.
    vision_config: _C
class
VisionEncoderInfo
(
ABC
,
Generic
[
_C
]):
class
VisionEncoderInfo
(
ABC
,
Generic
[
_C
]):
def
__init__
(
self
,
hf_config
:
_
C
)
->
None
:
def
__init__
(
self
,
hf_config
:
_
RootConfig
[
_C
]
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
hf_config
=
hf_config
self
.
hf_config
=
hf_config
...
@@ -95,7 +98,7 @@ VisionFeatureSelectStrategy = Union[
...
@@ -95,7 +98,7 @@ VisionFeatureSelectStrategy = Union[
def
_get_vision_feature_selector
(
def
_get_vision_feature_selector
(
strategy
:
VisionFeatureSelectStrategy
,
strategy
:
Union
[
VisionFeatureSelectStrategy
,
str
],
)
->
Callable
[[
torch
.
Tensor
],
torch
.
Tensor
]:
)
->
Callable
[[
torch
.
Tensor
],
torch
.
Tensor
]:
if
callable
(
strategy
):
if
callable
(
strategy
):
return
strategy
return
strategy
...
@@ -111,7 +114,28 @@ def _get_vision_feature_selector(
...
@@ -111,7 +114,28 @@ def _get_vision_feature_selector(
if
strategy
==
"full"
:
if
strategy
==
"full"
:
return
lambda
feats
:
feats
return
lambda
feats
:
feats
assert_never
(
strategy
)
raise
ValueError
(
f
"Unexpected feature select strategy:
{
strategy
!
r
}
"
)
def get_num_selected_vision_tokens(
    num_vision_tokens: int,
    strategy: Union[VisionFeatureSelectStrategy, str],
) -> int:
    """Return the number of vision tokens kept by a feature select strategy.

    Args:
        num_vision_tokens: Number of tokens produced by the vision encoder.
        strategy: Either a callable that takes a feature tensor and returns
            the selected features, or one of the names ``"class"``,
            ``"default"`` or ``"full"``.

    Returns:
        For a callable, the size of dimension 1 of its output when applied
        to a dummy ``(1, num_vision_tokens, 64)`` tensor. Otherwise ``1``
        for ``"class"``, ``num_vision_tokens - 1`` for ``"default"`` and
        ``num_vision_tokens`` for ``"full"``.

    Raises:
        ValueError: If ``strategy`` is not callable and not a known name.
    """
    if callable(strategy):
        # Probe the custom selector with an uninitialized [B, L, D] tensor;
        # only the resulting token dimension (L) is read back.
        probe = torch.empty(1, num_vision_tokens, 64)
        return strategy(probe).shape[1]

    if strategy == "class":
        num_selected = 1
    elif strategy == "default":
        num_selected = num_vision_tokens - 1
    elif strategy == "full":
        num_selected = num_vision_tokens
    else:
        raise ValueError(f"Unexpected feature select strategy: {strategy!r}")

    return num_selected
def
resolve_visual_encoder_outputs
(
def
resolve_visual_encoder_outputs
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment