Unverified commit 527ca321, authored by Raushan Turganbay, committed by GitHub
Browse files

[Bugfix] Fix more multimodal tests for transformers V5 (#34334)


Signed-off-by: raushan <raushan@huggingface.co>
parent 5458eb83
......@@ -108,6 +108,7 @@ _ADD_SPECIAL_TOKENS_OVERRIDES = {
"paligemma": False,
"ultravox": False,
"whisper": False,
"lfm2_vl": False,
}
_IGNORE_MM_KEYS = {
......
......@@ -810,9 +810,9 @@ class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"])
# Postprocess: rename mask and add chunk counts
# Handle different key names from different transformers versions
if "input_feature_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_feature_mask")
elif "feature_attention_mask" not in outputs and "input_features" in outputs:
if "input_features_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
elif "input_features_mask" not in outputs and "input_features" in outputs:
# If no mask is provided, create one from input_features
input_features = outputs["input_features"]
if isinstance(input_features, torch.Tensor):
......
......@@ -18,8 +18,8 @@ def _calculate_conv_output_length(
input_length: torch.Tensor, padding: int, kernel_size: int, stride: int
) -> torch.Tensor:
"""Calculate Conv1d output length using standard formula."""
# Standard formula: floor((input + 2*padding - kernel_size) / stride) + 1
return (input_length + 2 * padding - kernel_size) // stride + 1
# in sync with `hf_processor._get_audio_token_length`
return (input_length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
def _as_list_chunk_counts(
......
......@@ -347,7 +347,9 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
) -> BatchFeature:
# Text-only input not supported in composite processor
if not (images := mm_data.get("images", [])):
prompt_ids = self.info.get_tokenizer().encode(prompt)
prompt_ids = self.info.get_tokenizer().encode(
prompt, add_special_tokens=False
)
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
......
......@@ -1467,15 +1467,15 @@ class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
class Tarsier2Processor(Qwen2VLProcessor):
def __init__(
self,
vision_config: dict,
image_processor: Tarsier2ImageProcessor,
tokenizer: TokenizerLike,
video_processor: Qwen2VLVideoProcessor,
**kwargs,
):
self.image_processor = Tarsier2ImageProcessor(**vision_config)
super().__init__(
image_processor=self.image_processor,
image_processor=image_processor,
tokenizer=tokenizer,
video_processor=Qwen2VLVideoProcessor(**vision_config),
video_processor=video_processor,
chat_template=None,
**kwargs,
)
......@@ -1489,8 +1489,12 @@ class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
return correct_config
def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
vision_config = self.ctx.get_hf_image_processor_config()
image_processor = Tarsier2ImageProcessor(**vision_config)
video_processor = Qwen2VLVideoProcessor(**vision_config)
return Tarsier2Processor(
vision_config=self.ctx.get_hf_image_processor_config(),
image_processor=image_processor,
video_processor=video_processor,
tokenizer=self.get_tokenizer(),
**kwargs,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment