Unverified commit 85ee1d96, authored by Raushan Turganbay, committed by GitHub
Browse files

[Bugfix] Fix models and tests for transformers v5 (#33977)


Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 51a7bda6
...@@ -674,7 +674,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -674,7 +674,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------| |--------------|--------|--------|-------------------|----------------------|---------------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ | | `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ | | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ | | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
......
...@@ -198,10 +198,10 @@ def batch_make_video_embeddings( ...@@ -198,10 +198,10 @@ def batch_make_video_embeddings(
videos += video_batch videos += video_batch
# video to pixel values # video to pixel values
image_processor = processor.image_processor video_processor = processor.video_processor
preprocess_result = image_processor.preprocess( preprocess_result = video_processor.preprocess(
images=None, videos=videos, return_tensors="pt" videos=videos, return_tensors="pt"
).data ).data
pixel_values = preprocess_result["pixel_values_videos"] pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"] video_grid_thw = preprocess_result["video_grid_thw"]
...@@ -222,7 +222,7 @@ def batch_make_video_embeddings( ...@@ -222,7 +222,7 @@ def batch_make_video_embeddings(
embed_counter = 0 embed_counter = 0
for video_batch in video_batches_: for video_batch in video_batches_:
cur_batch_video_count = len(video_batch) cur_batch_video_count = len(video_batch)
merge_size = image_processor.merge_size merge_size = video_processor.merge_size
cur_batch_embed_len = sum( cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[ for grid_thw in video_grid_thw[
......
...@@ -81,7 +81,7 @@ def _run_test( ...@@ -81,7 +81,7 @@ def _run_test(
# Patch the issue where image_token_id # Patch the issue where image_token_id
# exceeds the maximum allowed vocab size # exceeds the maximum allowed vocab size
hf_model.model.resize_token_embeddings( hf_model.model.resize_token_embeddings(
hf_model.model.language_model.vocab_size + 1 hf_model.model.model.language_model.vocab_size + 1
) )
all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_inputs = hf_model.get_inputs(input_texts, images=input_images)
......
...@@ -33,7 +33,9 @@ def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel: ...@@ -33,7 +33,9 @@ def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
model = model_cls._from_config(config) model = model_cls._from_config(config)
# TODO(hmellor): Remove this once Transformers has fixed tied weights on meta device # TODO(hmellor): Remove this once Transformers has fixed tied weights on meta device
# https://github.com/huggingface/transformers/issues/43522 # https://github.com/huggingface/transformers/issues/43522
if getattr(config.get_text_config(), "tie_word_embeddings", False): if getattr(config.get_text_config(), "tie_word_embeddings", False) or getattr(
config, "tie_word_embeddings", False
):
model.tie_weights() model.tie_weights()
return model return model
......
...@@ -236,7 +236,7 @@ class AudioFlamingo3ProcessingInfo(BaseProcessingInfo): ...@@ -236,7 +236,7 @@ class AudioFlamingo3ProcessingInfo(BaseProcessingInfo):
) )
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"audio": None} return {"audio": 1}
class AudioFlamingo3DummyInputsBuilder( class AudioFlamingo3DummyInputsBuilder(
......
...@@ -692,7 +692,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo ...@@ -692,7 +692,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
hf_processor = self.info.get_hf_processor() hf_processor = self.info.get_hf_processor(typ=HunYuanVLProcessor)
image_token: str = hf_processor.image_token image_token: str = hf_processor.image_token
return image_token * num_images return image_token * num_images
......
...@@ -13,7 +13,7 @@ import torch.nn as nn ...@@ -13,7 +13,7 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from einops import rearrange from einops import rearrange
from transformers.image_processing_utils import BatchFeature from transformers.image_processing_utils import BatchFeature
from transformers.tokenization_utils import TensorType from transformers.utils import TensorType
from typing_extensions import TypedDict, Unpack from typing_extensions import TypedDict, Unpack
from vllm.config import VllmConfig from vllm.config import VllmConfig
......
...@@ -230,8 +230,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support ...@@ -230,8 +230,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
self.vision_feature_layer = config.vision_feature_layer self.vision_feature_layer = config.vision_feature_layer
self.vocab_size = config.text_config.vocab_size self.vocab_size = config.text_config.vocab_size
self.pad_token_id = -1 self.pad_token_id = -1
if self.config.pad_token_id is not None: if self.config.text_config.pad_token_id is not None:
self.pad_token_id = self.config.pad_token_id self.pad_token_id = self.config.text_config.pad_token_id
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors self.language_model.make_empty_intermediate_tensors
......
...@@ -6,10 +6,18 @@ ...@@ -6,10 +6,18 @@
from transformers import AutoProcessor from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
# Keyword-argument schema for BagelProcessor. It is handed to
# `self._merge_kwargs(...)` in `BagelProcessor.__call__`, which yields the
# per-modality dicts (`images_kwargs`, `text_kwargs`) used for the image
# processor and tokenizer calls.
class BagelProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
    # Defaults applied when the caller does not override them.
    _defaults = {
        "images_kwargs": {
            # Image processor output defaults to PyTorch tensors.
            "return_tensors": "pt",
        },
    }
class BagelProcessor(ProcessorMixin): class BagelProcessor(ProcessorMixin):
""" """
Constructs a BAGEL processor which wraps a Constructs a BAGEL processor which wraps a
...@@ -27,34 +35,32 @@ class BagelProcessor(ProcessorMixin): ...@@ -27,34 +35,32 @@ class BagelProcessor(ProcessorMixin):
| list[TextInput] | list[TextInput]
| list[PreTokenizedInput] = None, | list[PreTokenizedInput] = None,
images: ImageInput = None, images: ImageInput = None,
**kwargs, **kwargs: Unpack[BagelProcessorKwargs],
): ):
""" """
Main method to prepare for the model one or several sequences(s) and image(s). Main method to prepare for the model one or several sequences(s) and image(s).
""" """
output_kwargs = self._merge_kwargs(
BagelProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if images is not None: if images is not None:
# Process images with the image processor # Process images with the image processor
# Ensure return_tensors is set to "pt" for PyTorch tensors pixel_values = self.image_processor(
image_kwargs = {**kwargs} images, **output_kwargs["images_kwargs"]
if "return_tensors" not in image_kwargs: )
image_kwargs["return_tensors"] = "pt"
pixel_values = self.image_processor(images, **image_kwargs)
else: else:
pixel_values = None pixel_values = {}
text_inputs = self.tokenizer(text, **kwargs) if text is not None else None text_inputs = (
self.tokenizer(text, **output_kwargs["text_kwargs"])
if text is not None
else {}
)
if pixel_values is not None and text_inputs is not None: return BatchFeature(data={**pixel_values, **text_inputs})
# Combine text and image inputs into BatchFeature
combined = dict(text_inputs)
combined["pixel_values"] = pixel_values["pixel_values"]
return BatchFeature(combined)
elif pixel_values is not None:
return pixel_values
elif text_inputs is not None:
return BatchFeature(dict(text_inputs))
else:
return BatchFeature({})
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
......
...@@ -23,7 +23,6 @@ class HunYuanVLProcessor(ProcessorMixin): ...@@ -23,7 +23,6 @@ class HunYuanVLProcessor(ProcessorMixin):
self, self,
image_processor=None, image_processor=None,
tokenizer=None, tokenizer=None,
video_processor=None,
chat_template=None, chat_template=None,
**kwargs, **kwargs,
): ):
...@@ -42,9 +41,7 @@ class HunYuanVLProcessor(ProcessorMixin): ...@@ -42,9 +41,7 @@ class HunYuanVLProcessor(ProcessorMixin):
) )
self.pad_id = 120002 # self.tokenizer.pad_token_id self.pad_id = 120002 # self.tokenizer.pad_token_id
super().__init__( super().__init__(image_processor, tokenizer, chat_template=chat_template)
image_processor, tokenizer, video_processor, chat_template=chat_template
)
def __call__( def __call__(
self, self,
......
...@@ -43,9 +43,7 @@ class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-a ...@@ -43,9 +43,7 @@ class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-a
"padding": False, "padding": False,
}, },
"images_kwargs": { "images_kwargs": {
"max_partition": 9, "do_convert_rgb": True,
"covering_threshold": 0.9,
"convert_to_rgb": True,
"return_tensors": "pt", "return_tensors": "pt",
}, },
} }
...@@ -143,6 +141,10 @@ class OvisProcessor(ProcessorMixin): ...@@ -143,6 +141,10 @@ class OvisProcessor(ProcessorMixin):
- **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`. - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
- **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`. - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
""" """
max_partition = kwargs.pop("max_partition", 9)
covering_threshold = kwargs.pop("covering_threshold", 0.9)
output_kwargs = self._merge_kwargs( output_kwargs = self._merge_kwargs(
OvisProcessorKwargs, OvisProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs,
...@@ -159,7 +161,10 @@ class OvisProcessor(ProcessorMixin): ...@@ -159,7 +161,10 @@ class OvisProcessor(ProcessorMixin):
# Process each image # Process each image
for image in images if isinstance(images, list) else [images]: for image in images if isinstance(images, list) else [images]:
pixel_values, image_placeholders, grid = self.preprocess_image( pixel_values, image_placeholders, grid = self.preprocess_image(
image=image, **output_kwargs["images_kwargs"] image=image,
max_partition=max_partition,
covering_threshold=covering_threshold,
**output_kwargs["images_kwargs"],
) )
processed_images.append(pixel_values) processed_images.append(pixel_values)
image_placeholders_list.append(image_placeholders) image_placeholders_list.append(image_placeholders)
...@@ -300,7 +305,7 @@ class OvisProcessor(ProcessorMixin): ...@@ -300,7 +305,7 @@ class OvisProcessor(ProcessorMixin):
image: PIL.Image.Image, image: PIL.Image.Image,
max_partition, max_partition,
covering_threshold, covering_threshold,
convert_to_rgb, do_convert_rgb,
return_tensors, return_tensors,
): ):
def _preprocess(img: PIL.Image.Image, side): def _preprocess(img: PIL.Image.Image, side):
...@@ -394,7 +399,7 @@ class OvisProcessor(ProcessorMixin): ...@@ -394,7 +399,7 @@ class OvisProcessor(ProcessorMixin):
# pick the partition with maximum covering_ratio and break the tie using #sub_images # pick the partition with maximum covering_ratio and break the tie using #sub_images
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0] return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
if convert_to_rgb: if do_convert_rgb:
image = convert_image_mode(image, "RGB") image = convert_image_mode(image, "RGB")
sides = self.get_image_size() sides = self.get_image_size()
......
...@@ -24,14 +24,10 @@ class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[cal ...@@ -24,14 +24,10 @@ class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[cal
"padding": False, "padding": False,
}, },
"images_kwargs": { "images_kwargs": {
"convert_to_rgb": True, "do_convert_rgb": True,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS,
}, },
"videos_kwargs": { "videos_kwargs": {
"convert_to_rgb": True, "do_convert_rgb": True,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS,
}, },
} }
...@@ -160,6 +156,9 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -160,6 +156,9 @@ class Ovis2_5Processor(ProcessorMixin):
- **second_per_grid_ts** -- list of video seconds per time grid. - **second_per_grid_ts** -- list of video seconds per time grid.
Returned when `videos` is not `None`. Returned when `videos` is not `None`.
""" """
min_pixels = kwargs.pop("min_pixels", MIN_PIXELS)
max_pixels = kwargs.pop("max_pixels", MAX_PIXELS)
output_kwargs = self._merge_kwargs( output_kwargs = self._merge_kwargs(
Ovis2_5ProcessorKwargs, Ovis2_5ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs,
...@@ -175,7 +174,10 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -175,7 +174,10 @@ class Ovis2_5Processor(ProcessorMixin):
# Process each image # Process each image
for image in images if isinstance(images, list) else [images]: for image in images if isinstance(images, list) else [images]:
pixel_values, image_placeholders, grid = self.preprocess_multidata( pixel_values, image_placeholders, grid = self.preprocess_multidata(
images=image, **output_kwargs["images_kwargs"] images=image,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["images_kwargs"],
) )
processed_images.append(pixel_values) processed_images.append(pixel_values)
image_placeholders_list.append(image_placeholders) image_placeholders_list.append(image_placeholders)
...@@ -194,7 +196,10 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -194,7 +196,10 @@ class Ovis2_5Processor(ProcessorMixin):
# Process each video # Process each video
for video in videos if isinstance(videos, list) else [videos]: for video in videos if isinstance(videos, list) else [videos]:
pixel_values, video_placeholders, grid = self.preprocess_multidata( pixel_values, video_placeholders, grid = self.preprocess_multidata(
video=video, **output_kwargs["videos_kwargs"] video=video,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["videos_kwargs"],
) )
processed_videos.append(pixel_values) processed_videos.append(pixel_values)
videos_placeholders_list.append(video_placeholders) videos_placeholders_list.append(video_placeholders)
...@@ -378,7 +383,7 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -378,7 +383,7 @@ class Ovis2_5Processor(ProcessorMixin):
self, self,
images: PIL.Image.Image | list[PIL.Image.Image] | None = None, images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
video: list[PIL.Image.Image] | np.ndarray | None = None, video: list[PIL.Image.Image] | np.ndarray | None = None,
convert_to_rgb: bool | None = True, do_convert_rgb: bool | None = True,
min_pixels: int = MIN_PIXELS, min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS, max_pixels: int = MAX_PIXELS,
return_tensors: str | None = "pt", return_tensors: str | None = "pt",
...@@ -404,7 +409,7 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -404,7 +409,7 @@ class Ovis2_5Processor(ProcessorMixin):
min_pixels if min_pixels is not None else MIN_PIXELS, min_pixels if min_pixels is not None else MIN_PIXELS,
) )
images = [ images = [
image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image image.convert("RGB") if do_convert_rgb and image.mode != "RGB" else image
for image in images for image in images
] ]
...@@ -420,9 +425,9 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -420,9 +425,9 @@ class Ovis2_5Processor(ProcessorMixin):
max_pixels=max_pixels, max_pixels=max_pixels,
) )
new_size = dict(height=resized_height, width=resized_width) new_size = dict(height=resized_height, width=resized_width)
image_pt = self.image_processor.preprocess( image_pt = self.image_processor.preprocess(image, size=new_size)[
image, size=new_size, return_tensors="np" "pixel_values"
)["pixel_values"][0] ][0]
processed_images.append(image_pt) processed_images.append(image_pt)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment