Unverified commit fc689d75, authored by Raushan Turganbay, committed by GitHub

Add video modality for InstructBLIP (#30182)

* squash in single commit

* add docs

* dummy obj

* more changes in diff converter

* tiny fix

* make docs happy

* skip test

* repo consistency tests

* update docstring

* style

* fix tests

* change diff imports

* [run-slow] instructblipvideo

* [run-slow] instructblipvideo

* fix tests and remove logit check

* [run-slow] instructblipvideo
parent a958c4a8
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for InstructBlipVideo. Largely a copy of Blip2Processor with the addition of a tokenizer for the Q-Former.
"""
import os
from typing import List, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import VideoInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ..auto import AutoTokenizer
class InstructBlipVideoProcessor(ProcessorMixin):
r"""
Constructs an InstructBLIPVideo processor which wraps an InstructBLIPVideo image processor and a LLaMA/T5 tokenizer into a single
processor.
[`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.
Args:
image_processor (`InstructBlipVideoImageProcessor`):
An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
qformer_tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "InstructBlipVideoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer):
super().__init__(image_processor, tokenizer)
# add QFormer tokenizer
self.qformer_tokenizer = qformer_tokenizer
def __call__(
self,
images: VideoInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
"""
This method uses [`InstructBlipVideoImageProcessor.__call__`] to prepare image(s) or video(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
encoding = BatchFeature()
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding.update(text_encoding)
qformer_text_encoding = self.qformer_tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
if images is not None:
image_encoding = self.image_processor(images, return_tensors=return_tensors)
encoding.update(image_encoding)
return encoding
# Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
# Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# overwrite to save the Q-Former tokenizer in a separate folder
def save_pretrained(self, save_directory, **kwargs):
if os.path.isfile(save_directory):
raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)
return super().save_pretrained(save_directory, **kwargs)
# overwrite to load the Q-Former tokenizer from a separate folder
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
args.append(qformer_tokenizer)
return cls(*args)
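
For reference, a minimal usage sketch of the processor defined above (not part of the diff). The checkpoint id matches the one used in the integration test further down; the random clip and the local save directory are purely illustrative.

import numpy as np
from transformers import InstructBlipVideoProcessor

processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
clip = np.random.randint(0, 256, (4, 224, 224, 3), dtype=np.uint8)  # fake 4-frame clip, channels-last
inputs = processor(images=clip, text="What is happening in this video?", return_tensors="pt")
# inputs now holds input_ids / attention_mask (language-model tokenizer),
# qformer_input_ids / qformer_attention_mask (Q-Former tokenizer) and pixel_values.

processor.save_pretrained("saved_processor")          # writes a qformer_tokenizer/ subfolder
reloaded = InstructBlipVideoProcessor.from_pretrained("saved_processor")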
......@@ -4755,6 +4755,34 @@ class InstructBlipVisionModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoQFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVideoVisionModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class JambaForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
......
......@@ -303,6 +303,13 @@ class ImageGPTImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class InstructBlipVideoImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
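
The dummy classes in the two hunks above follow the library's usual backend-guard pattern: importing an InstructBlipVideo class without torch (or vision) installed yields a placeholder whose instantiation fails loudly. A rough, self-contained sketch of that pattern with simplified signatures; it is an illustration, not the library's exact code.

class DummyObject(type):
    # Metaclass used by the auto-generated dummy modules: any public attribute
    # access on the class is routed to requires_backends, which raises.
    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)

def requires_backends(obj, backends):
    # Simplified: the real helper first checks whether the backends are available.
    name = getattr(obj, "__name__", obj.__class__.__name__)
    raise ImportError(f"{name} requires the following backends: {', '.join(backends)}")

class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject):
    _backends = ["torch"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])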
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import InstructBlipVideoImageProcessor
class InstructBlipVideoProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=5,
num_channels=3,
image_size=24,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
frames=4,
):
size = size if size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.frames = frames
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
def expected_output_image_shape(self, images):
return self.frames, self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
images = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
# let's simply repeat each image to fake a multi-frame video clip
if numpify or torchify:
videos = []
for image in images:
if numpify:
video = image[None, ...].repeat(self.frames, 0)
else:
video = image[None, ...].repeat(self.frames, 1, 1, 1)
videos.append(video)
else:
videos = []
for pil_image in images:
videos.append([pil_image] * self.frames)
return videos
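
For orientation, the tester above fakes clips by repeating a single frame, and the image processor is expected to fold them into a 5D pixel_values tensor. A small sketch of that shape contract, mirroring the tester's 18x18 target size (the random input clip is illustrative only):

import numpy as np
from transformers import InstructBlipVideoImageProcessor

image_processor = InstructBlipVideoImageProcessor(size={"height": 18, "width": 18})
clip = np.random.randint(0, 256, (4, 3, 30, 30), dtype=np.uint8)  # one fake clip: 4 frames, channels-first
pixel_values = image_processor(images=clip, return_tensors="pt").pixel_values
print(pixel_values.shape)  # expected (1, 4, 3, 18, 18), matching the unbatched case in the tests below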
@require_torch
@require_vision
class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = InstructBlipVideoProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL videos (lists of PIL images)
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for video in video_inputs:
self.assertIsInstance(video[0], Image.Image)
# Test not batched input (a video passed via the `images` arg, to check that the image processor can handle videos)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, np.ndarray)
# Test not batched input (a video passed via the `images` arg, to check that the image processor can handle videos)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, torch.Tensor)
# Test not batched input
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
expected_output_video_shape = (1, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
expected_output_video_shape = (5, 4, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch InstructBlipVideo model."""
import inspect
import tempfile
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import (
CONFIG_MAPPING,
InstructBlipVideoConfig,
InstructBlipVideoProcessor,
InstructBlipVideoQFormerConfig,
InstructBlipVideoVisionConfig,
)
from transformers.testing_utils import (
require_accelerate,
require_bitsandbytes,
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
random_attention_mask,
)
if is_torch_available():
import torch
from torch import nn
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel
if is_vision_available():
pass
class InstructBlipVideoVisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=30,
frames=4,
patch_size=2,
num_channels=3,
is_training=True,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
initializer_range=1e-10,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.frames = frames
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
# in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches + 1
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[self.batch_size * self.frames, self.num_channels, self.image_size, self.image_size]
)
config = self.get_config()
return config, pixel_values
def get_config(self):
return InstructBlipVideoVisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, pixel_values):
model = InstructBlipVideoVisionModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(pixel_values)
# expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
image_size = (self.image_size, self.image_size)
patch_size = (self.patch_size, self.patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size * self.frames, num_patches + 1, self.hidden_size)
)
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.frames, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
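
A quick sanity check of the shapes exercised by this tester, using its default values: frames are folded into the batch dimension before the vision encoder, and each frame contributes (image_size // patch_size) ** 2 patches plus one [CLS] token.

batch_size, frames, image_size, patch_size, hidden_size = 12, 4, 30, 2, 32  # tester defaults
num_patches = (image_size // patch_size) ** 2     # 15 * 15 = 225
seq_length = num_patches + 1                      # 226, including the [CLS] token
expected_shape = (batch_size * frames, seq_length, hidden_size)  # (48, 226, 32) for last_hidden_state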
@require_torch
class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as InstructBlipVideo's vision encoder does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (InstructBlipVideoVisionModel,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = InstructBlipVideoVisionModelTester(self)
self.config_tester = ConfigTester(
self, config_class=InstructBlipVideoVisionConfig, has_text_modality=False, hidden_size=37
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="InstructBlipVideo's vision encoder does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="InstructBlipVideo's vision encoder is an nn.Embeddings layer")
def test_model_get_set_embeddings(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(
reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training"
)
def test_training(self):
pass
@unittest.skip(
reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training"
)
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "Salesforce/instructblip-vicuna-7b"
model = InstructBlipVideoVisionModel.from_pretrained(model_name)
self.assertIsNotNone(model)
class InstructBlipVideoQFormerModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=512,
initializer_range=0.02,
bos_token_id=0,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = scope
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
config = self.get_config()
return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask
def get_config(self):
return InstructBlipVideoQFormerConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
bos_token_id=self.bos_token_id,
)
# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py
class InstructBlipVideoTextModelDecoderOnlyTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_labels=False,
vocab_size=99,
hidden_size=16,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=100,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
embed_dim=16,
num_labels=3,
word_embed_proj_dim=16,
type_sequence_label_size=2,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.embed_dim = embed_dim
self.num_labels = num_labels
self.type_sequence_label_size = type_sequence_label_size
self.word_embed_proj_dim = word_embed_proj_dim
self.is_encoder_decoder = False
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
input_ids[:, -1] = self.eos_token_id # Eos Token
attention_mask = input_ids.ne(self.pad_token_id)
return config, input_ids, attention_mask
def get_config(self):
return CONFIG_MAPPING["opt"](
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
ffn_dim=self.intermediate_size,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
embed_dim=self.embed_dim,
is_encoder_decoder=False,
word_embed_proj_dim=self.word_embed_proj_dim,
)
# this model tester uses a decoder-only language model (OPT)
class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
def __init__(
self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
):
if vision_kwargs is None:
vision_kwargs = {}
if qformer_kwargs is None:
qformer_kwargs = {}
if text_kwargs is None:
text_kwargs = {}
self.parent = parent
self.vision_model_tester = InstructBlipVideoVisionModelTester(parent, **vision_kwargs)
self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs)
self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
self.is_training = is_training
self.num_query_tokens = num_query_tokens
def prepare_config_and_inputs(self):
_, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
_, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs()
_, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
frames = self.vision_model_tester.frames
_, c, h, w = pixel_values.shape
pixel_values = pixel_values.reshape(-1, frames, c, h, w)
config = self.get_config()
return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values
def get_config(self):
return InstructBlipVideoConfig.from_vision_qformer_text_configs(
vision_config=self.vision_model_tester.get_config(),
qformer_config=self.qformer_model_tester.get_config(),
text_config=self.text_model_tester.get_config(),
num_query_tokens=self.num_query_tokens,
)
def create_and_check_for_conditional_generation(
self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values
):
model = InstructBlipVideoForConditionalGeneration(config).to(torch_device).eval()
with torch.no_grad():
result = model(
pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
qformer_input_ids=qformer_input_ids,
qformer_attention_mask=qformer_attention_mask,
)
expected_seq_length = (
self.num_query_tokens * self.vision_model_tester.frames
) + self.text_model_tester.seq_length
self.parent.assertEqual(
result.logits.shape,
(self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size),
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
"qformer_input_ids": qformer_input_ids,
"qformer_attention_mask": qformer_attention_mask,
"labels": input_ids,
}
return config, inputs_dict
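
With the testers' defaults, the sequence length asserted in create_and_check_for_conditional_generation works out as follows: the Q-Former contributes num_query_tokens tokens per frame, and the text tokens follow.

num_query_tokens, frames, text_seq_length = 10, 4, 7                 # defaults of the testers above
expected_seq_length = num_query_tokens * frames + text_seq_length    # 10 * 4 + 7 = 47
# so result.logits is expected to have shape (batch_size, 47, vocab_size)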
@require_torch
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
def setUp(self):
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="InstructBlipVideoForConditionalGeneration doesn't support inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Tied weights are tested in individual model tests")
def test_tied_weights_keys(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="InstructBlipVideoModel does not have input/output embeddings")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="There's no base InstructBlipVideoModel")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="There's no base InstructBlipVideoModel")
def test_save_load_fast_init_to_base(self):
pass
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_load_vision_qformer_text_config(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save InstructBlipVideoConfig and check if we can load InstructBlipVideoVisionConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
vision_config = InstructBlipVideoVisionConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
# Save InstructBlipVideoConfig and check if we can load InstructBlipVideoQFormerConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
qformer_config = InstructBlipVideoQFormerConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())
@slow
def test_model_from_pretrained(self):
model_name = "Salesforce/instructblip-vicuna-7b"
model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on a short video clip
def prepare_video():
video_file = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
)
video = np.load(video_file)[::2] # sample every 2nd frame to get 4 frames total
return video
@require_vision
@require_torch
@require_bitsandbytes
@require_accelerate
@slow
class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
def test_inference_vicuna_7b(self):
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
)
clip = prepare_video()
prompt = "Explain what is happening in this short video."
inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
# verify generation
outputs = model.generate(**inputs, max_new_tokens=30)
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
self.assertEqual(
generated_text,
"a baby girl wearing glasses is reading a book on the bed 1080p",
)
......@@ -90,6 +90,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"RecurrentGemmaModel", # Building part of bigger (tested) model.
"FuyuForCausalLM", # Not tested fort now
"InstructBlipQFormerModel", # Building part of bigger (tested) model.
"InstructBlipVideoQFormerModel", # Building part of bigger (tested) model.
"UMT5EncoderModel", # Building part of bigger (tested) model.
"Blip2QFormerModel", # Building part of bigger (tested) model.
"ErnieMForInformationExtraction",
......@@ -245,6 +246,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"GPTSw3DoubleHeadsModel",
"InstructBlipVisionModel",
"InstructBlipQFormerModel",
"InstructBlipVideoVisionModel",
"InstructBlipVideoQFormerModel",
"LayoutLMForQuestionAnswering",
"LukeForMaskedLM",
"LukeForEntityClassification",
......
......@@ -173,7 +173,7 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
- LLaMa -> MyNewModel and MyNewModel -> Llama
"""
def __init__(self, old_name, new_name):
def __init__(self, old_name, new_name, given_old_name=None, given_new_name=None):
super().__init__()
self.old_name = old_name
self.new_name = new_name
......@@ -183,6 +183,8 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
old_name.upper(): new_name.upper(),
"".join(x.title() for x in old_name.split("_")): self.default_name,
}
if given_old_name is not None and given_new_name is not None and given_old_name not in self.patterns:
self.patterns[given_old_name] = given_new_name
def preserve_case_replace(self, text):
# Create a regex pattern to match all variations
......@@ -201,9 +203,9 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
return updated_node.with_changes(value=update)
def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma"):
def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma", given_old_name=None, given_new_name=None):
"""Helper function to rename and then parse a source file using the ClassFinder"""
transformer = ReplaceNameTransformer(old_id, new_id)
transformer = ReplaceNameTransformer(old_id, new_id, given_old_name, given_new_name)
new_module = module.visit(transformer)
wrapper = MetadataWrapper(new_module)
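
The given_old_name / given_new_name pair added here lets callers supply CamelCase names that cannot be derived from the lowercase module name (for instance InstructBlip, which naive title-casing of "instructblip" would render as Instructblip). Below is a stand-alone sketch of the case-preserving rename that such a pattern table drives; it is an illustration in the spirit of ReplaceNameTransformer.patterns, not the library's exact implementation.

import re

# Assumed pattern table, extended with the explicitly given CamelCase pair.
patterns = {
    "instructblip": "instructblipvideo",
    "InstructBlip": "InstructBlipVideo",   # given_old_name -> given_new_name
    "INSTRUCTBLIP": "INSTRUCTBLIPVIDEO",
}

def preserve_case_replace(text: str) -> str:
    # Replace every known variation of the old name with its mapped counterpart.
    regex = re.compile("|".join(map(re.escape, patterns)))
    return regex.sub(lambda m: patterns[m.group(0)], text)

print(preserve_case_replace("class InstructBlipProcessor(InstructBlipPreTrainedModel):"))
# -> class InstructBlipVideoProcessor(InstructBlipVideoPreTrainedModel):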
......@@ -356,11 +358,13 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
class DiffConverterTransformer(CSTTransformer):
METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider)
def __init__(self, python_module, new_name):
def __init__(self, python_module, new_name, given_old_name=None, given_new_name=None):
super().__init__()
self.model_name = (
new_name # name of the model being defined. Should be in the format of `llama` or `layout_xlm` or `phi3`
)
self.given_old_name = given_old_name
self.given_new_name = given_new_name
# fmt: off
self.python_module = python_module # we store the original module to use `code_for_node`
self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module
......@@ -426,6 +430,7 @@ class DiffConverterTransformer(CSTTransformer):
"insert_idx": self.global_scope_index,
"node": updated_node,
}
self.config_body = [updated_node]
return updated_node
def leave_ClassDef(self, original_node, updated_node):
......@@ -457,13 +462,18 @@ class DiffConverterTransformer(CSTTransformer):
f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name"
)
if super_file_name not in self.visited_module: # only extract classes once
visited_module = self.visited_module
if super_file_name not in visited_module: # only extract classes once
class_finder = find_classes_in_file(
self.transformers_imports[super_file_name], model_name, self.model_name
self.transformers_imports[super_file_name],
model_name,
self.model_name,
self.given_old_name,
self.given_new_name,
)
self.visited_module[super_file_name] = class_finder
visited_module[super_file_name] = class_finder
else: # we are re-using the previously parsed data
class_finder = self.visited_module[super_file_name]
class_finder = visited_module[super_file_name]
list_dependencies = {
dep: class_finder.class_start_line.get(dep, 1000)
......@@ -474,7 +484,7 @@ class DiffConverterTransformer(CSTTransformer):
start_insert_idx = self.global_scope_index
for dependency, _ in list_dependencies:
node = class_finder.global_nodes.get(dependency, None)
if node is not None:
if node is not None and "Config" not in class_name:
if dependency not in self.new_body:
start_insert_idx -= 1
self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node}
......@@ -485,7 +495,7 @@ class DiffConverterTransformer(CSTTransformer):
if len(list_dependencies) > 0:
updated_node = replace_call_to_super(class_finder, updated_node, class_name)
if "Config" in class_name:
self.config_body = [updated_node]
self.config_body += [updated_node]
else:
self.new_body[class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
return updated_node
......@@ -503,10 +513,24 @@ class DiffConverterTransformer(CSTTransformer):
def leave_Module(self, original_node: cst.Assign, node):
imports = {self.python_module.code_for_node(k): k for k in self.all_imports}
dependency_imports = {}
config_imports = []
for visiter in self.visited_module.values():
dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()})
# manually clean up if it's importing a config from configuration file (ruff doesn't do that)
config_imports = []
for i in list(dependency_imports.values()):
if (
hasattr(i.body[0], "module")
and isinstance(i.body[0].module, cst.Name)
and f"configuration_{self.model_name}" in i.body[0].module.value
):
pass
else:
config_imports.append(i)
if hasattr(self, "config_body"):
self.config_body = list(imports.values()) + self.config_body
self.config_body = list(imports.values()) + config_imports + self.config_body
dependency_imports.update(imports)
new_body = list(dependency_imports.values())
if len(self.new_body.keys()) > 0:
......@@ -516,7 +540,7 @@ class DiffConverterTransformer(CSTTransformer):
return node.with_changes(body=[*new_body])
def convert_file(diff_file, cst_transformers=None):
def convert_file(diff_file, old_model_name=None, new_model_name=None, cst_transformers=None):
model_name = re.search(r"diff_(.*)(?=\.py$)", diff_file).groups()[0]
# Parse the Python file
with open(diff_file, "r") as file:
......@@ -524,7 +548,7 @@ def convert_file(diff_file, cst_transformers=None):
module = cst.parse_module(code)
wrapper = MetadataWrapper(module)
if cst_transformers is None:
cst_transformers = DiffConverterTransformer(module, model_name)
cst_transformers = DiffConverterTransformer(module, model_name, old_model_name, new_model_name)
new_mod = wrapper.visit(cst_transformers)
ruffed_code = run_ruff(new_mod.code, True)
formatted_code = run_ruff(ruffed_code, False)
......@@ -551,10 +575,20 @@ if __name__ == "__main__":
nargs="+",
help="A list of `diff_xxxx` files that should be converted to single model file",
)
parser.add_argument(
"--old_model_name",
required=False,
help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from diff-file",
)
parser.add_argument(
"--new_model_name",
required=False,
help="The name of the new model being added in CamelCase. If not provided is inferred from diff-file",
)
args = parser.parse_args()
if args.files_to_parse == ["all"]:
args.files_to_parse = glob.glob("src/transformers/models/**/diff_*.py", recursive=True)
for file_name in args.files_to_parse:
print(f"Converting {file_name} to a single model single file format")
module_path = file_name.replace("/", ".").replace(".py", "").replace("src.", "")
converter = convert_file(file_name)
converter = convert_file(file_name, args.old_model_name, args.new_model_name)
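
With the new options, the converter can be invoked along the lines below. The script path and diff-file location are assumptions for illustration; the flags are exactly the ones defined above.

python utils/diff_model_converter.py \
    --files_to_parse src/transformers/models/instructblipvideo/diff_instructblipvideo.py \
    --old_model_name InstructBlip \
    --new_model_name InstructBlipVideo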