Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
edd5fe5f
Unverified
Commit
edd5fe5f
authored
Jun 24, 2024
by
Isotr0py
Committed by
GitHub
Jun 24, 2024
Browse files
[Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772)
parent
5d4d9053
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
5 deletions
+72
-5
requirements-cpu.txt
requirements-cpu.txt
+1
-0
requirements-cuda.txt
requirements-cuda.txt
+2
-0
requirements-test.txt
requirements-test.txt
+0
-1
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+4
-0
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+65
-4
No files found.
requirements-cpu.txt
View file @
edd5fe5f
...
...
@@ -3,4 +3,5 @@
# Dependencies for x86_64 CPUs
torch == 2.3.1+cpu
torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
requirements-cuda.txt
View file @
edd5fe5f
...
...
@@ -5,5 +5,7 @@
ray >= 2.9
nvidia-ml-py # for pynvml package
torch == 2.3.0
# These must be updated alongside torch
torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
requirements-test.txt
View file @
edd5fe5f
...
...
@@ -14,7 +14,6 @@ peft
requests
ray
sentence-transformers # required for embedding
torchvision # required for the image processor of phi3v
# Benchmarking
aiohttp
...
...
tests/models/test_phi3v.py
View file @
edd5fe5f
...
...
@@ -22,6 +22,7 @@ assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
def
iter_phi3v_configs
(
model_name
:
str
):
image_hw_to_feature_size
=
{
(
1008
,
1344
):
1921
,
(
2016
,
2688
):
1933
,
}
for
(
h
,
w
),
f
in
image_hw_to_feature_size
.
items
():
...
...
@@ -75,6 +76,9 @@ if is_cpu():
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
# Since we use _attn_implementation="eager" for hf_runner, here is
# numeric difference for longer context and test can't pass
@
pytest
.
mark
.
xfail
(
reason
=
"Inconsistent image processor being used due to lack "
"of support for dynamic image token replacement"
)
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
...
...
vllm/model_executor/models/phi3v.py
View file @
edd5fe5f
...
...
@@ -13,14 +13,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
Iterable
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
from
typing
import
Dict
,
Iterable
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
PIL
import
Image
from
transformers
import
CLIPVisionConfig
,
PretrainedConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
VisionLanguageConfig
from
vllm.config
import
CacheConfig
,
ModelConfig
,
VisionLanguageConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
...
...
@@ -32,9 +35,11 @@ from vllm.model_executor.models.llama import LlamaModel
from
vllm.model_executor.models.vlm_base
import
VisionLanguageModelBase
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
get_dummy_image_data
from
vllm.multimodal.image
import
ImagePixelData
,
get_dummy_image_data
from
vllm.sequence
import
SamplerOutput
logger
=
init_logger
(
__name__
)
_KEYS_TO_MODIFY_MAPPING
=
{
"model.vision_embed_tokens"
:
"vision_embed_tokens"
,
}
...
...
@@ -268,7 +273,63 @@ class Phi3VImagePixelInputs(TypedDict):
"""Shape: (batch_size, 2)"""
@
MULTIMODAL_REGISTRY
.
register_image_pixel_input
()
# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
def
calc_padded_size
(
width
,
height
,
padding_unit
=
336
):
target_height
=
int
(
np
.
ceil
(
height
/
padding_unit
)
*
padding_unit
)
top_padding
=
int
((
target_height
-
height
)
/
2
)
bottom_padding
=
target_height
-
height
-
top_padding
padded_width
=
width
padded_height
=
height
+
top_padding
+
bottom_padding
return
padded_width
,
padded_height
# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
def
calc_hd_transform_size
(
width
,
height
,
hd_num
=
16
):
transposed
=
False
if
width
<
height
:
width
,
height
=
height
,
width
transposed
=
True
ratio
=
width
/
height
scale
=
1
while
scale
*
np
.
ceil
(
scale
/
ratio
)
<=
hd_num
:
scale
+=
1
scale
-=
1
new_width
=
int
(
scale
*
336
)
new_height
=
int
(
new_width
/
ratio
)
padded_width
,
padded_height
=
calc_padded_size
(
new_width
,
new_height
)
if
transposed
:
padded_width
,
padded_height
=
padded_height
,
padded_width
return
padded_width
,
padded_height
def
_image_processor
(
data
:
ImagePixelData
,
model_config
:
ModelConfig
,
vlm_config
:
VisionLanguageConfig
,
)
->
Dict
[
str
,
torch
.
Tensor
]:
image
=
data
.
image
if
isinstance
(
image
,
Image
.
Image
):
# Temporary patch before dynamic number of image tokens is supported
_
,
_
,
h
,
w
=
vlm_config
.
image_input_shape
if
(
w
,
h
)
!=
calc_hd_transform_size
(
image
.
width
,
image
.
height
):
logger
.
warning
(
"Dynamic image shape is currently not supported. "
"Resizing input image to (%d, %d)."
,
w
,
h
)
data
.
image
=
image
.
resize
((
w
,
h
))
return
MULTIMODAL_REGISTRY
.
_get_plugin_for_data_type
(
ImagePixelData
)
\
.
_default_input_processor
(
data
,
model_config
,
vlm_config
)
@
MULTIMODAL_REGISTRY
.
register_image_pixel_input
(
_image_processor
)
@
MULTIMODAL_REGISTRY
.
register_dummy_data
(
get_dummy_image_data
)
class
Phi3VForCausalLM
(
VisionLanguageModelBase
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment