Unverified commit fc689d75 authored by Raushan Turganbay, committed by GitHub

Add video modality for InstructBLIP (#30182)

* squash in single commit

* add docs

* dummy obj

* more changes in diff converter

* tiny fix

* make docs happy

* skip test

* repo consistency tests

* update docstring

* style

* fix tests

* change diff imports

* [run-slow] instructblipvideo

* [run-slow] instructblipvideo

* fix tests and remove logit check

* [run-slow] instructblipvideo
parent a958c4a8
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for InstructBLIP Video. Largely a copy of Blip2Processor, with the addition of a tokenizer for the Q-Former.
"""
import os
from typing import List, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import VideoInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ..auto import AutoTokenizer
class InstructBlipVideoProcessor(ProcessorMixin):
r"""
Constructs an InstructBLIPVideo processor which wraps an InstructBLIPVideo image processor and a LLaMa/T5 tokenizer into a single
processor.
[`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.
Args:
image_processor (`InstructBlipVideoImageProcessor`):
An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
qformer_tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "InstructBlipVideoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer):
super().__init__(image_processor, tokenizer)
# add QFormer tokenizer
self.qformer_tokenizer = qformer_tokenizer
def __call__(
self,
images: VideoInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
"""
This method uses [`InstructBlipVideoImageProcessor.__call__`] to prepare image(s) or video(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstrings of the above two methods for more information.
"""
encoding = BatchFeature()
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding.update(text_encoding)
qformer_text_encoding = self.qformer_tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
if images is not None:
image_encoding = self.image_processor(images, return_tensors=return_tensors)
encoding.update(image_encoding)
return encoding
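# Illustrative usage sketch (not part of the original file; the checkpoint path and the `clip` variable
# below are hypothetical). With a video passed as `images` and a prompt as `text`, the returned
# `BatchFeature` is expected to hold the main tokenizer outputs, the Q-Former tokenizer outputs and the
# pixel values:
#
#   processor = InstructBlipVideoProcessor.from_pretrained("path/to/instructblipvideo-checkpoint")
#   inputs = processor(images=clip, text="What is happening in the video?", return_tensors="pt")
#   sorted(inputs.keys())
#   # ['attention_mask', 'input_ids', 'pixel_values', 'qformer_attention_mask', 'qformer_input_ids']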
# Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
# Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# overwrite to save the Q-Former tokenizer in a separate folder
def save_pretrained(self, save_directory, **kwargs):
if os.path.isfile(save_directory):
raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)
return super().save_pretrained(save_directory, **kwargs)
# overwrite to load the Q-Former tokenizer from a separate folder
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
args.append(qformer_tokenizer)
return cls(*args)
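# Illustrative round-trip sketch (not part of the original file; `processor` and `tmp_dir` are assumed to
# already exist). Because `save_pretrained`/`from_pretrained` are overridden above, the Q-Former tokenizer
# is written to and read back from its own subfolder:
#
#   processor.save_pretrained(tmp_dir)                      # creates tmp_dir/qformer_tokenizer/
#   reloaded = InstructBlipVideoProcessor.from_pretrained(tmp_dir)
#   assert reloaded.qformer_tokenizer is not None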
@@ -4755,6 +4755,34 @@ class InstructBlipVisionModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoQFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoVisionModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class JambaForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
@@ -303,6 +303,13 @@ class ImageGPTImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class InstructBlipVideoImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import InstructBlipVideoImageProcessor
class InstructBlipVideoProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=5,
num_channels=3,
image_size=24,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
frames=4,
):
size = size if size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.frames = frames
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
def expected_output_image_shape(self, images):
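# per-video shape, without the batch dimension: (frames, channels, height, width)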
return self.frames, self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
images = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
# let's simply copy the frames to fake a long video-clip
if numpify or torchify:
videos = []
for image in images:
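# note: np.ndarray.repeat(frames, 0) and torch.Tensor.repeat(frames, 1, 1, 1) have different calling
# conventions, but both replicate the single frame along a new leading axis to build a `frames`-long clip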
if numpify:
video = image[None, ...].repeat(self.frames, 0)
else:
video = image[None, ...].repeat(self.frames, 1, 1, 1)
videos.append(video)
else:
videos = []
for pil_image in images:
videos.append([pil_image] * self.frames)
return videos
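# Illustrative check (not part of the original test file), assuming numpy inputs:
#
#   tester = InstructBlipVideoProcessingTester(parent=None)
#   videos = tester.prepare_image_inputs(equal_resolution=True, numpify=True)
#   len(videos)         # -> batch_size (5)
#   videos[0].shape[0]  # -> frames (4)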
@require_torch
@require_vision
class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = InstructBlipVideoProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL videos (lists of PIL images)
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for video in video_inputs:
self.assertIsInstance(video[0], Image.Image)
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
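# expected shape: (batch_size, frames, channels, height, width); a single video still gets a batch dim of 1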
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, np.ndarray)
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, torch.Tensor)
# Test not batched input
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
@@ -90,6 +90,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"RecurrentGemmaModel", # Building part of bigger (tested) model.
"FuyuForCausalLM", # Not tested fort now
"InstructBlipQFormerModel", # Building part of bigger (tested) model.
"InstructBlipVideoQFormerModel", # Building part of bigger (tested) model.
"UMT5EncoderModel", # Building part of bigger (tested) model.
"Blip2QFormerModel", # Building part of bigger (tested) model.
"ErnieMForInformationExtraction",
@@ -245,6 +246,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"GPTSw3DoubleHeadsModel",
"InstructBlipVisionModel",
"InstructBlipQFormerModel",
"InstructBlipVideoVisionModel",
"InstructBlipVideoQFormerModel",
"LayoutLMForQuestionAnswering",
"LukeForMaskedLM",
"LukeForEntityClassification",
@@ -173,7 +173,7 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
- LLaMa -> MyNewModel and MyNewModel -> Llama
"""
def __init__(self, old_name, new_name):
def __init__(self, old_name, new_name, given_old_name=None, given_new_name=None):
super().__init__()
self.old_name = old_name
self.new_name = new_name
@@ -183,6 +183,8 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
old_name.upper(): new_name.upper(),
"".join(x.title() for x in old_name.split("_")): self.default_name,
}
if given_old_name is not None and given_new_name is not None and given_old_name not in self.patterns:
self.patterns[given_old_name] = given_new_name
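# Illustrative example (not part of the diff): with old_name="llama" and new_name="gemma" the patterns
# above map "llama" -> "gemma", "LLAMA" -> "GEMMA" and "Llama" -> the default CamelCase name; passing
# given_old_name/given_new_name (e.g. "InstructBlip" -> "InstructBlipVideo") adds one exact pair that
# the automatic case rules could not derive on their own.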
def preserve_case_replace(self, text):
# Create a regex pattern to match all variations
@@ -201,9 +203,9 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
return updated_node.with_changes(value=update)
def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma"):
def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma", given_old_name=None, given_new_name=None):
"""Helper function to rename and then parse a source file using the ClassFinder"""
transformer = ReplaceNameTransformer(old_id, new_id)
transformer = ReplaceNameTransformer(old_id, new_id, given_old_name, given_new_name)
new_module = module.visit(transformer)
wrapper = MetadataWrapper(new_module)
@@ -356,11 +358,13 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
class DiffConverterTransformer(CSTTransformer):
METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider)
def __init__(self, python_module, new_name):
def __init__(self, python_module, new_name, given_old_name=None, given_new_name=None):
super().__init__()
self.model_name = (
new_name # name of the model being defined. Should be in the format of `llama` or `layout_xlm` or `phi3`
)
self.given_old_name = given_old_name
self.given_new_name = given_new_name
# fmt: off
self.python_module = python_module # we store the original module to use `code_for_node`
self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module
@@ -426,6 +430,7 @@ class DiffConverterTransformer(CSTTransformer):
"insert_idx": self.global_scope_index,
"node": updated_node,
}
self.config_body = [updated_node]
return updated_node
def leave_ClassDef(self, original_node, updated_node):
@@ -457,13 +462,18 @@ class DiffConverterTransformer(CSTTransformer):
f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name"
)
if super_file_name not in self.visited_module: # only extract classes once
visited_module = self.visited_module
if super_file_name not in visited_module: # only extract classes once
class_finder = find_classes_in_file(
self.transformers_imports[super_file_name], model_name, self.model_name
self.transformers_imports[super_file_name],
model_name,
self.model_name,
self.given_old_name,
self.given_new_name,
)
self.visited_module[super_file_name] = class_finder
visited_module[super_file_name] = class_finder
else: # we are re-using the previously parsed data
class_finder = self.visited_module[super_file_name]
class_finder = visited_module[super_file_name]
list_dependencies = {
dep: class_finder.class_start_line.get(dep, 1000)
@@ -474,7 +484,7 @@ class DiffConverterTransformer(CSTTransformer):
start_insert_idx = self.global_scope_index
for dependency, _ in list_dependencies:
node = class_finder.global_nodes.get(dependency, None)
if node is not None:
if node is not None and "Config" not in class_name:
if dependency not in self.new_body:
start_insert_idx -= 1
self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node}
@@ -485,7 +495,7 @@ class DiffConverterTransformer(CSTTransformer):
if len(list_dependencies) > 0:
updated_node = replace_call_to_super(class_finder, updated_node, class_name)
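# Illustrative note (not part of the diff): *Config classes are accumulated in `self.config_body` instead
# of `self.new_body`, presumably because they are emitted into a separate configuration_<model>.py file
# while everything else goes to the modeling file.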
if "Config" in class_name:
self.config_body = [updated_node]
self.config_body += [updated_node]
else:
self.new_body[class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
return updated_node
@@ -503,10 +513,24 @@ class DiffConverterTransformer(CSTTransformer):
def leave_Module(self, original_node: cst.Assign, node):
imports = {self.python_module.code_for_node(k): k for k in self.all_imports}
dependency_imports = {}
config_imports = []
for visiter in self.visited_module.values():
dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()})
# manually clean up if it's importing a config from configuration file (ruff doesn't do that)
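# (illustrative note, not in the original diff: roughly, imports that point at the model's own
# `configuration_<model_name>` module are skipped below, so the generated configuration file does not
# end up importing from itself; all other imports are kept in `config_imports`)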
config_imports = []
for i in list(dependency_imports.values()):
if (
hasattr(i.body[0], "module")
and isinstance(i.body[0].module, cst.Name)
and f"configuration_{self.model_name}" in i.body[0].module.value
):
pass
else:
config_imports.append(i)
if hasattr(self, "config_body"):
self.config_body = list(imports.values()) + self.config_body
self.config_body = list(imports.values()) + config_imports + self.config_body
dependency_imports.update(imports)
new_body = list(dependency_imports.values())
if len(self.new_body.keys()) > 0:
@@ -516,7 +540,7 @@ class DiffConverterTransformer(CSTTransformer):
return node.with_changes(body=[*new_body])
def convert_file(diff_file, cst_transformers=None):
def convert_file(diff_file, old_model_name=None, new_model_name=None, cst_transformers=None):
model_name = re.search(r"diff_(.*)(?=\.py$)", diff_file).groups()[0]
# Parse the Python file
with open(diff_file, "r") as file:
@@ -524,7 +548,7 @@ def convert_file(diff_file, cst_transformers=None):
module = cst.parse_module(code)
wrapper = MetadataWrapper(module)
if cst_transformers is None:
cst_transformers = DiffConverterTransformer(module, model_name)
cst_transformers = DiffConverterTransformer(module, model_name, old_model_name, new_model_name)
new_mod = wrapper.visit(cst_transformers)
ruffed_code = run_ruff(new_mod.code, True)
formatted_code = run_ruff(ruffed_code, False)
@@ -551,10 +575,20 @@ if __name__ == "__main__":
nargs="+",
help="A list of `diff_xxxx` files that should be converted to single model file",
)
parser.add_argument(
"--old_model_name",
required=False,
help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from diff-file",
)
parser.add_argument(
"--new_model_name",
required=False,
help="The name of the new model being added in CamelCase. If not provided is inferred from diff-file",
)
args = parser.parse_args()
if args.files_to_parse == ["all"]:
args.files_to_parse = glob.glob("src/transformers/models/**/diff_*.py", recursive=True)
for file_name in args.files_to_parse:
print(f"Converting {file_name} to a single model single file format")
module_path = file_name.replace("/", ".").replace(".py", "").replace("src.", "")
converter = convert_file(file_name)
converter = convert_file(file_name, args.old_model_name, args.new_model_name)
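# Illustrative invocation (not part of the diff; the script path and file names below are hypothetical):
#
#   python utils/diff_model_converter.py \
#       --files_to_parse src/transformers/models/instructblipvideo/diff_instructblipvideo.py \
#       --old_model_name InstructBlip --new_model_name InstructBlipVideo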