Unverified commit 999df95b, authored by Jiahao Li, committed by GitHub
Browse files

[Bugfix] Make image processor respect `mm_processor_kwargs` for Qwen2-VL (#10112)


Signed-off-by: Jiahao Li <liplus17@163.com>
parent a6f332d0
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
# limitations under the License. # limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
from functools import partial from functools import partial
from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
Tuple, Type, TypedDict, Union) Optional, Tuple, Type, TypedDict, Union)
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -558,6 +558,17 @@ class Qwen2VisionTransformer(nn.Module): ...@@ -558,6 +558,17 @@ class Qwen2VisionTransformer(nn.Module):
# === Vision input helpers === # # === Vision input helpers === #
def get_mm_processor_kwargs(
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None) -> Dict[str, int]:
    """Build the keyword arguments used to override image pixel limits.

    Only limits that were actually supplied are included, so the result
    can be unpacked directly into the image-processor factory without
    overriding its defaults.

    Args:
        min_pixels: Optional lower pixel bound; omitted from the result
            when falsy (``None`` or ``0``).
        max_pixels: Optional upper pixel bound; omitted from the result
            when falsy (``None`` or ``0``).

    Returns:
        A dict containing ``"min_pixels"`` and/or ``"max_pixels"`` for the
        truthy arguments, in that order; empty when neither is set.
    """
    overrides = (("min_pixels", min_pixels), ("max_pixels", max_pixels))
    # Truthiness (not ``is not None``) is deliberate: a value of 0 is
    # treated the same as "not provided".
    return {name: limit for name, limit in overrides if limit}
def mm_input_mapper_for_qwen2_vl( def mm_input_mapper_for_qwen2_vl(
ctx: InputContext, ctx: InputContext,
data: MultiModalData[object], data: MultiModalData[object],
...@@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl( ...@@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl(
model_config = ctx.model_config model_config = ctx.model_config
# Handle mm processor kwargs; we pass these at creation time # Handle mm processor kwargs; we pass these at creation time
# because preprocess() in transformers doesn't expose them # because preprocess() in transformers doesn't expose them
mm_processor_kwargs = {} mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
if min_pixels: max_pixels=max_pixels)
mm_processor_kwargs["min_pixels"] = min_pixels
if max_pixels:
mm_processor_kwargs["max_pixels"] = max_pixels
image_processor = cached_get_image_processor( image_processor = cached_get_image_processor(
model_config.model, model_config.model,
trust_remote_code=model_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
...@@ -683,7 +690,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, ...@@ -683,7 +690,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
*, *,
min_pixels=None, min_pixels=None,
max_pixels=None) -> int: max_pixels=None) -> int:
image_processor = cached_get_image_processor(ctx.model_config.model) mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = cached_get_image_processor(ctx.model_config.model,
**mm_processor_kwargs)
max_resized_height, max_resized_width, max_llm_image_tokens = \ max_resized_height, max_resized_width, max_llm_image_tokens = \
_get_max_image_info(image_processor, data_type_key=data_type_key, _get_max_image_info(image_processor, data_type_key=data_type_key,
mm_count=1, min_pixels=min_pixels, mm_count=1, min_pixels=min_pixels,
...@@ -705,7 +715,10 @@ def dummy_data_for_qwen2_vl( ...@@ -705,7 +715,10 @@ def dummy_data_for_qwen2_vl(
min_pixels: Optional[int] = None, min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None max_pixels: Optional[int] = None
) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
image_processor = cached_get_image_processor(ctx.model_config.model) mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = cached_get_image_processor(ctx.model_config.model,
**mm_processor_kwargs)
num_images = mm_counts["image"] num_images = mm_counts["image"]
max_resized_height, max_resized_width, max_llm_image_tokens = \ max_resized_height, max_resized_width, max_llm_image_tokens = \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment