Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
df1a2113
Unverified
Commit
df1a2113
authored
Aug 21, 2024
by
zifeitong
Committed by
GitHub
Aug 22, 2024
Browse files
[Model] Fix Phi-3.5-vision-instruct 'num_crops' issue (#7710)
parent
7937009a
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
37 additions
and
13 deletions
+37
-13
docs/source/models/supported_models.rst
docs/source/models/supported_models.rst
+2
-2
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+1
-1
vllm/config.py
vllm/config.py
+5
-1
vllm/inputs/registry.py
vllm/inputs/registry.py
+9
-2
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+6
-6
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+14
-1
No files found.
docs/source/models/supported_models.rst
View file @
df1a2113
...
...
@@ -225,9 +225,9 @@ Multimodal Language Models
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
-
* - :code:`Phi3VForCausalLM`
- Phi-3-Vision
- Phi-3-Vision
, Phi-3.5-Vision
- Image
- :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
- :code:`microsoft/Phi-3-vision-128k-instruct`,
:code:`microsoft/Phi-3.5-vision-instruct`
etc.
-
* - :code:`MiniCPMV`
- MiniCPM-V
...
...
tests/models/test_phi3v.py
View file @
df1a2113
...
...
@@ -21,7 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
})
models
=
[
"microsoft/Phi-3-vision-
128k-
instruct"
]
models
=
[
"microsoft/Phi-3
.5
-vision-instruct"
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
...
...
vllm/config.py
View file @
df1a2113
...
...
@@ -13,7 +13,9 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.platforms
import
current_platform
from
vllm.tracing
import
is_otel_available
,
otel_import_error_traceback
from
vllm.transformers_utils.config
import
get_config
,
get_hf_text_config
from
vllm.transformers_utils.config
import
(
get_config
,
get_hf_image_processor_config
,
get_hf_text_config
)
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
GiB_bytes
,
cuda_device_count_stateless
,
get_cpu_memory
,
is_cpu
,
is_hip
,
is_neuron
,
is_openvino
,
is_xpu
,
...
...
@@ -167,6 +169,8 @@ class ModelConfig:
self
.
hf_config
=
get_config
(
self
.
model
,
trust_remote_code
,
revision
,
code_revision
,
rope_scaling
,
rope_theta
)
self
.
hf_text_config
=
get_hf_text_config
(
self
.
hf_config
)
self
.
hf_image_processor_config
=
get_hf_image_processor_config
(
self
.
model
,
revision
)
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_text_config
,
dtype
)
# Choose a default enforce_eager value if the user did not specify
...
...
vllm/inputs/registry.py
View file @
df1a2113
...
...
@@ -2,8 +2,8 @@ import functools
from
array
import
array
from
collections
import
UserDict
from
dataclasses
import
dataclass
from
typing
import
(
TYPE_CHECKING
,
Callable
,
Dict
,
Mapping
,
Optional
,
Protocol
,
Tuple
,
Type
)
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
Dict
,
Mapping
,
Optional
,
Protocol
,
Tuple
,
Type
)
from
torch
import
nn
from
transformers
import
PretrainedConfig
...
...
@@ -55,6 +55,13 @@ class InputContext:
return
hf_config
def
get_hf_image_processor_config
(
self
)
->
Dict
[
str
,
Any
]:
"""
Get the HuggingFace image processor configuration of the model.
"""
return
self
.
model_config
.
hf_image_processor_config
N
=
TypeVar
(
"N"
,
bound
=
Type
[
nn
.
Module
])
...
...
vllm/model_executor/models/phi3v.py
View file @
df1a2113
...
...
@@ -15,8 +15,8 @@
# limitations under the License.
import
re
from
functools
import
lru_cache
from
typing
import
(
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Tuple
,
TypedDict
,
Union
)
from
typing
import
(
Any
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Tuple
,
TypedDict
,
Union
)
import
numpy
as
np
import
torch
...
...
@@ -324,12 +324,12 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181
def
get_phi3v_image_feature_size
(
hf_config
:
PretrainedConfig
,
hf_config
:
Dict
[
str
,
Any
]
,
*
,
input_height
:
int
,
input_width
:
int
,
)
->
int
:
num_crops
=
getattr
(
hf_config
,
"num_crops"
,
16
)
num_crops
=
hf_config
.
get
(
"num_crops"
,
16
)
new_width
,
new_height
=
_calc_hd_transform_size
(
width
=
input_width
,
height
=
input_height
,
hd_num
=
num_crops
)
...
...
@@ -341,7 +341,7 @@ def get_phi3v_image_feature_size(
def
get_max_phi3v_image_tokens
(
ctx
:
InputContext
):
return
get_phi3v_image_feature_size
(
ctx
.
get_hf_config
(),
ctx
.
get_hf_
image_processor_
config
(),
input_height
=
MAX_IMAGE_FEATURE_SIZE_HEIGHT
,
input_width
=
MAX_IMAGE_FEATURE_SIZE_WIDTH
,
)
...
...
@@ -395,7 +395,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
return
llm_inputs
model_config
=
ctx
.
model_config
hf_config
=
ctx
.
get_hf_config
()
hf_config
=
ctx
.
get_hf_
image_processor_
config
()
image_data
=
multi_modal_data
[
"image"
]
if
isinstance
(
image_data
,
Image
.
Image
):
...
...
vllm/transformers_utils/config.py
View file @
df1a2113
import
contextlib
from
pathlib
import
Path
from
typing
import
Dict
,
Optional
,
Type
,
Union
from
typing
import
Any
,
Dict
,
Optional
,
Type
,
Union
from
transformers
import
GenerationConfig
,
PretrainedConfig
from
transformers.models.auto.image_processing_auto
import
(
get_image_processor_config
)
from
transformers.models.auto.modeling_auto
import
(
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
...
...
@@ -98,6 +100,17 @@ def get_config(
return
config
def
get_hf_image_processor_config
(
model
:
Union
[
str
,
Path
],
revision
:
Optional
[
str
]
=
None
,
**
kwargs
,
)
->
Dict
[
str
,
Any
]:
# Separate model folder from file path for GGUF models
if
Path
(
model
).
is_file
()
and
Path
(
model
).
suffix
==
".gguf"
:
model
=
Path
(
model
).
parent
return
get_image_processor_config
(
model
,
revision
=
revision
,
**
kwargs
)
def
get_hf_text_config
(
config
:
PretrainedConfig
):
"""Get the "sub" config relevant to llm for multi modal models.
No op for pure text models.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment