Unverified Commit d91fd7f9 authored by NielsRogge, committed by GitHub

Add LLaVa-1.6, bis (#29586)



* First draft

* Fix tests, add docs

* Improve docstrings

* Fix test

* Address comments

* Address comments

* Remove vocab_size attribute

* Remove batch_size

* Address comment

* Add image processor tests

* Support fx

* Update docstring

* Add support for 34b

* Convert 34b model

* Add integration tests

* Update checkpoints

* Convert vicuna-13b, remove doc tests

* Remove script

* Remove file

* Address comments

* Improve docstrings

* Deprecate vocab_size

* Remove aspect_ratio_setting

* Address comments

* Update READMEs

* Add tips about chat templates

* Fix tests

* Deprecate vocab_size safely

* Update tests

---------
Co-authored-by: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
parent 9d999481
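
Not part of the diff: a minimal usage sketch for the classes added in this PR, assuming the converted `llava-hf/llava-v1.6-mistral-7b-hf` checkpoint referenced below, the Mistral prompt format used in the conversion script, and a CUDA device for the `.to("cuda")` calls.

```python
import requests
import torch
from PIL import Image

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

# Checkpoint name taken from the config archive map / conversion script in this diff.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# Mistral-style prompt, as used for liuhaotian/llava-v1.6-mistral-7b in the conversion script.
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```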
@@ -77,6 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("layoutlmv3", "LayoutLMv3ImageProcessor"),
        ("levit", "LevitImageProcessor"),
        ("llava", "CLIPImageProcessor"),
+        ("llava_next", "CLIPImageProcessor"),
        ("mask2former", "Mask2FormerImageProcessor"),
        ("maskformer", "MaskFormerImageProcessor"),
        ("mgp-str", "ViTImageProcessor"),
......
@@ -286,6 +286,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
        ("idefics", "IdeficsForVisionText2Text"),
        ("layoutlm", "LayoutLMForMaskedLM"),
        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
        ("longformer", "LongformerForMaskedLM"),
        ("luke", "LukeForMaskedLM"),
        ("lxmert", "LxmertForPreTraining"),
@@ -675,6 +676,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
        ("instructblip", "InstructBlipForConditionalGeneration"),
        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
        ("pix2struct", "Pix2StructForConditionalGeneration"),
        ("vipllava", "VipLlavaForConditionalGeneration"),
        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
......
@@ -66,6 +66,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("layoutlmv2", "LayoutLMv2Processor"),
        ("layoutlmv3", "LayoutLMv3Processor"),
        ("llava", "LlavaProcessor"),
+        ("llava_next", "LlavaNextProcessor"),
        ("markuplm", "MarkupLMProcessor"),
        ("mctct", "MCTCTProcessor"),
        ("mgp-str", "MgpstrProcessor"),
......
@@ -223,6 +223,7 @@ else:
            ),
        ),
        ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+        ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
        (
            "longt5",
......
@@ -16,7 +16,10 @@ from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

-_import_structure = {"configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"]}
+_import_structure = {
+    "configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"],
+    "processing_llava": ["LlavaProcessor"],
+}

try:
@@ -30,11 +33,11 @@ else:
        "LlavaForConditionalGeneration",
        "LlavaPreTrainedModel",
    ]
-    _import_structure["processing_llava"] = ["LlavaProcessor"]

if TYPE_CHECKING:
    from .configuration_llava import LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaConfig
+    from .processing_llava import LlavaProcessor

try:
    if not is_torch_available():
@@ -47,8 +50,6 @@ if TYPE_CHECKING:
            LlavaForConditionalGeneration,
            LlavaPreTrainedModel,
        )
-        from .processing_llava import LlavaProcessor

else:
    import sys
......
@@ -13,6 +13,8 @@
# limitations under the License.
""" Llava model configuration"""

+import warnings
+
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
@@ -37,10 +39,10 @@ class LlavaConfig(PretrainedConfig):
    documentation from [`PretrainedConfig`] for more information.

    Args:
-        vision_config (`LlavaVisionConfig`, *optional*):
-            Custom vision config or dict
-        text_config (`Union[AutoConfig, dict]`, *optional*):
-            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
@@ -48,12 +50,10 @@ class LlavaConfig(PretrainedConfig):
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
-            The feature selection strategy used to select the vision feature from the CLIP backbone.
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer to select the vision feature.
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]

    Example:
@@ -88,25 +88,34 @@ class LlavaConfig(PretrainedConfig):
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
-        vocab_size=32000,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(
+                "vision_feature_select_strategy should be one of 'default', 'full'."
+                f"Got: {vision_feature_select_strategy}"
+            )
+
+        if "vocab_size" in kwargs:
+            warnings.warn(
+                "The `vocab_size` argument is deprecated and will be removed in v4.42, since it can be inferred from the `text_config`. Passing this argument has no effect",
+                FutureWarning,
+            )
+
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
-        self.vocab_size = vocab_size

-        self.vision_config = vision_config
-
-        if isinstance(self.vision_config, dict):
+        if isinstance(vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
            )
-            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
-            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
@@ -116,15 +125,29 @@ class LlavaConfig(PretrainedConfig):
                vocab_size=32000,
                projection_dim=768,
            )
-        self.vocab_size = self.vocab_size

-        self.text_config = text_config
+        self.vision_config = vision_config

-        if isinstance(self.text_config, dict):
+        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-            self.vocab_size = self.text_config.vocab_size
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
-            self.text_config = CONFIG_MAPPING["llama"]()
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self._vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)

+    @property
+    def vocab_size(self):
+        warnings.warn(
+            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
+            FutureWarning,
+        )
+        return self._vocab_size
+
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_vocab_size", None)
+        return output
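
A quick illustration (not part of the diff) of how the `vocab_size` deprecation above behaves once applied: the attribute still resolves to the text backbone's vocabulary size but emits a `FutureWarning`, while `text_config.vocab_size` is the supported access path.

```python
import warnings

from transformers import LlavaConfig

config = LlavaConfig()

# Supported access path after this change.
print(config.text_config.vocab_size)

# Deprecated access path: returns the same value but warns.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(config.vocab_size)

assert any(issubclass(w.category, FutureWarning) for w in caught)
```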
@@ -216,6 +216,11 @@ LLAVA_INPUTS_DOCSTRING = r"""
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
@@ -240,7 +245,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = LlavaMultiModalProjector(config)
-        self.vocab_size = config.vocab_size
+        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
@@ -272,7 +277,6 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds
@@ -433,38 +437,38 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
                )
                if labels is None:
                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
-            else:
-                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
-                # generation with cache
-                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
-                    # Retrieve the first layer to inspect the logits and mask out the hidden states
-                    # that are set to 0
-                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                    # Get the target length
-                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
-
-                    extended_attention_mask = torch.ones(
-                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
-                        dtype=attention_mask.dtype,
-                        device=attention_mask.device,
-                    )
-
-                    # Filter out only the tokens that can be un-attended, this can happen
-                    # if one uses Llava + Fused modules where the cache on the
-                    # first iteration is already big enough, or if one passes custom cache
-                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                    new_batch_index = batch_index[valid_indices]
-                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+            # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+            # generation with cache
+            elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                # Retrieve the first layer to inspect the logits and mask out the hidden states
+                # that are set to 0
+                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                # Get the target length
+                target_seqlen = first_layer_past_key_value.shape[-1] + 1
+
+                extended_attention_mask = torch.ones(
+                    (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+
+                # Filter out only the tokens that can be un-attended, this can happen
+                # if one uses Llava + Fused modules where the cache on the
+                # first iteration is already big enough, or if one passes custom cache
+                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                new_batch_index = batch_index[valid_indices]
+                new_non_attended_tokens = non_attended_tokens[valid_indices]

                # Zero-out the places where we don't need to attend
                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0

                attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1

        outputs = self.language_model(
            attention_mask=attention_mask,
......
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_llava_next": ["LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaNextConfig"],
"processing_llava_next": ["LlavaNextProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_llava_next"] = [
"LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"LlavaNextForConditionalGeneration",
"LlavaNextPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_llava_next"] = ["LlavaNextImageProcessor"]
if TYPE_CHECKING:
from .configuration_llava_next import LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaNextConfig
from .processing_llava_next import LlavaNextProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_llava_next import (
LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
LlavaNextForConditionalGeneration,
LlavaNextPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_llava_next import LlavaNextImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Llava-NeXT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"llava-hf/llava-v1.6-mistral-7b-hf": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/resolve/main/config.json",
}
class LlavaNextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
The config object or dictionary of the vision backbone.
text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
The config object or dictionary of the text backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
image_token_index (`int`, *optional*, defaults to 32000):
The image token index to encode the image prompt.
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the multimodal projector.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
If `"full"`, the full vision features are used.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
Example:
```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig
>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()
>>> # Initializing a Llama config
>>> text_config = LlamaConfig()
>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)
>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llava_next"
is_composition = False
def __init__(
self,
vision_config=None,
text_config=None,
ignore_index=-100,
image_token_index=32000,
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_grid_pinpoints=None,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
"vision_feature_select_strategy should be one of 'default', 'full'."
f"Got: {vision_feature_select_strategy}"
)
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
image_grid_pinpoints = (
image_grid_pinpoints
if image_grid_pinpoints is not None
else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
)
self.image_grid_pinpoints = image_grid_pinpoints
if isinstance(vision_config, dict):
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
hidden_size=1024,
patch_size=14,
image_size=336,
num_hidden_layers=24,
num_attention_heads=16,
vocab_size=32000,
projection_dim=768,
)
self.vision_config = vision_config
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
self.text_config = text_config
super().__init__(**kwargs)
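
To make `image_grid_pinpoints` concrete, here is an illustrative sketch, not the library's implementation, of how one of these candidate resolutions can be chosen for an input image: keep the pinpoint that preserves the largest downscaled image area and, on ties, wastes the least padding. The helper name is made up for this example.

```python
from typing import List, Tuple


def choose_grid_pinpoint(original_size: Tuple[int, int], pinpoints: List[List[int]]) -> Tuple[int, int]:
    """Toy 'best resolution' selection over the (height, width) pinpoints of LlavaNextConfig."""
    orig_height, orig_width = original_size
    best, best_effective, best_wasted = None, -1, float("inf")
    for height, width in pinpoints:
        # Scale the image to fit inside the candidate resolution without distortion.
        scale = min(width / orig_width, height / orig_height)
        down_width, down_height = int(orig_width * scale), int(orig_height * scale)
        effective = min(down_width * down_height, orig_width * orig_height)  # usable image area
        wasted = height * width - effective  # padded area
        if effective > best_effective or (effective == best_effective and wasted < best_wasted):
            best, best_effective, best_wasted = (height, width), effective, wasted
    return best


# Default pinpoints from LlavaNextConfig; the 899x1024 image used in the conversion script
# below lands on the 672x672 candidate under this heuristic.
pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
print(choose_grid_pinpoint((899, 1024), pinpoints))  # -> (672, 672)
```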
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert LLaVa-NeXT (LLaVa-1.6) checkpoints from the original repository.
URL: https://github.com/haotian-liu/LLaVA/tree/main.
The command used to obtain original logits is the following:
python llava/eval/run_llava.py --model-path "liuhaotian/llava-v1.6-mistral-7b" --image-file "images/llava_v1_5_radar.jpg" --query "What is shown in this image?" --max_new_tokens 100 --temperature 0
Note: logits are tested with torch==2.1.2.
"""
import argparse
import glob
import json
from pathlib import Path
import requests
import torch
from accelerate import init_empty_weights
from huggingface_hub import hf_hub_download, snapshot_download
from PIL import Image
from safetensors import safe_open
from transformers import (
AddedToken,
AutoConfig,
AutoTokenizer,
LlavaNextConfig,
LlavaNextForConditionalGeneration,
LlavaNextImageProcessor,
LlavaNextProcessor,
)
KEYS_TO_MODIFY_MAPPING = {
"model.vision_tower.": "",
"model.mm_projector": "multi_modal_projector",
"model": "model.model",
"vision_model.model": "vision_model",
"lm_head": "language_model.lm_head",
"model.model": "language_model.model",
"multi_modal_projector.0": "multi_modal_projector.linear_1",
"multi_modal_projector.2": "multi_modal_projector.linear_2",
"language_model.model.image_newline": "image_newline",
}
def load_original_state_dict(model_id):
directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"])
original_state_dict = {}
for path in glob.glob(f"{directory_path}/*"):
if path.endswith(".safetensors"):
with safe_open(path, framework="pt", device="cpu") as f:
for key in f.keys():
original_state_dict[key] = f.get_tensor(key)
return original_state_dict
def convert_state_dict_to_hf(state_dict):
new_state_dict = {}
for key, value in state_dict.items():
if key.endswith(".inv_freq"):
continue
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
new_state_dict[key] = value.to(torch.float16)
return new_state_dict
def load_image():
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
return image
def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
# load original config
filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model")
# read json
with open(filepath) as f:
data = json.load(f)
print(data)
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
text_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
text_model_id = "lmsys/vicuna-7b-v1.5"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
text_model_id = "lmsys/vicuna-13b-v1.5"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-34b":
text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B"
image_token_index = 64000
vision_model_id = data["mm_vision_tower"]
torch.set_default_dtype(torch.float16)
text_config = AutoConfig.from_pretrained(text_model_id)
use_fast = False if model_id == "liuhaotian/llava-v1.6-34b" else True
tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast)
tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special_tokens=True)
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
# Mistral-7B doesn't have a padding token set yet
tokenizer.add_special_tokens({"pad_token": "<pad>"})
image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id)
processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor)
config = LlavaNextConfig(
text_config=text_config.to_dict(),
image_grid_pinpoints=image_processor.image_grid_pinpoints,
use_image_newline_parameter=True,
image_token_index=image_token_index,
)
with init_empty_weights():
model = LlavaNextForConditionalGeneration(config)
# load original state dict
state_dict = load_original_state_dict(model_id)
state_dict = convert_state_dict_to_hf(state_dict)
model.load_state_dict(state_dict, assign=True)
model.eval()
pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data
mu = torch.mean(pre_expansion_embeddings, dim=0).float()
n = pre_expansion_embeddings.size()[0]
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)
# We add an image token so we resize the model
# Pad to 64 for performance reasons
pad_shape = 64
vocab_size = config.text_config.vocab_size
if model_id == "liuhaotian/llava-v1.6-34b":
# this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and <image>
num_tokens = vocab_size + 3
else:
# this one has 2 additional tokens, namely <image> and <pad>
num_tokens = vocab_size + 2
model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape)
model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack(
tuple(
(dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]))
),
dim=0,
)
model.language_model.lm_head.weight.data[vocab_size:] = torch.stack(
tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))),
dim=0,
)
device = "cuda:2"
model.to(device)
# prepare inputs
image = load_image()
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]:
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
elif model_id == "liuhaotian/llava-v1.6-34b":
prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
inputs = processor(images=image, text=prompt, return_tensors="pt")
# verify inputs
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset")
original_pixel_values = torch.load(filepath, map_location="cpu")
assert torch.allclose(original_pixel_values, inputs.pixel_values.half())
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset")
original_input_ids = torch.load(filepath, map_location="cpu")
# replace -200 by image_token_index (since we use token ID = 32000 for the image token)
original_input_ids[original_input_ids == -200] = image_token_index
print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200]))
assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()
elif model_id == "liuhaotian/llava-v1.6-34b":
filepath = hf_hub_download(
repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset"
)
original_input_ids = torch.load(filepath, map_location="cpu")
# replace -200 by image_token_index
original_input_ids[original_input_ids == -200] = image_token_index
assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()
image_sizes = torch.tensor([[899, 1024]])
assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist()
# verify single forward pass
print("Single forward pass")
with torch.inference_mode():
inputs = inputs.to(device)
outputs = model(**inputs)
print("Shape of logits:", outputs.logits.shape)
print("First values of logits:", outputs.logits[0, :3, :3])
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
expected_slice = torch.tensor(
[[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
expected_slice = torch.tensor(
[[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
expected_slice = torch.tensor(
[[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-34b":
expected_slice = torch.tensor(
[[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]],
dtype=torch.float32,
device=device,
)
else:
raise ValueError(f"Model {model_id} not supported")
assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)
print("Logits are ok!")
# verify generation
output_ids = model.generate(
**inputs,
max_new_tokens=100,
use_cache=True,
)
generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print("Generated text:", repr(generated_text))
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
expected_text = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "'
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V"""
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM"
elif model_id == "liuhaotian/llava-v1.6-34b":
expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-"
else:
raise ValueError(f"Model {model_id} not supported")
assert generated_text == expected_text
print("Generated text is ok!")
# verify batched generation
print("Batched generation...")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
cats_image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
images=[image, cats_image],
text=[prompt, "[INST] <image>\nHow many cats are there? [/INST]"],
padding=True,
return_tensors="pt",
).to(device)
for k, v in inputs.items():
print(k, v.shape)
print("Image sizes:", inputs.image_sizes)
# make sure image_sizes are the same
# as otherwise batched generation doesn't work
inputs.image_sizes[1] = inputs.image_sizes[0]
print("Batched generation...")
output_ids = model.generate(
**inputs,
max_new_tokens=20,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(outputs)
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
repo_id = model_id.split("/")[-1]
model.push_to_hub(f"llava-hf/{repo_id}-hf")
processor.push_to_hub(f"llava-hf/{repo_id}-hf")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_id",
help="Hub location of the model to convert",
default="liuhaotian/llava-v1.6-mistral-7b",
choices=[
"liuhaotian/llava-v1.6-mistral-7b",
"liuhaotian/llava-v1.6-vicuna-7b",
"liuhaotian/llava-v1.6-vicuna-13b",
"liuhaotian/llava-v1.6-34b",
],
required=False,
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub)
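
For reference, a minimal way to drive the conversion from Python rather than through argparse; the module name is hypothetical and depends on where the script above is saved, while the function and its arguments come directly from the script.

```python
# Hypothetical module name; adjust to wherever the conversion script above lives.
from convert_llava_next_weights_to_hf import convert_llava_to_hf

# Convert the Mistral-7B variant and save the HF-format model and processor locally,
# without pushing to the Hub (mirrors the script's argparse defaults).
convert_llava_to_hf(
    model_id="liuhaotian/llava-v1.6-mistral-7b",
    pytorch_dump_folder_path="./llava-v1.6-mistral-7b-hf",
    push_to_hub=False,
)
```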
This diff is collapsed.
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LLaVa-NeXT.
"""
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LlavaNextProcessor(ProcessorMixin):
r"""
Constructs a LLaVa-NeXT processor which wraps a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
Args:
image_processor ([`LlavaNextImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LlavaNextImageProcessor"
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None):
super().__init__(image_processor, tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
images: ImageInput = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length=None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if images is not None:
image_inputs = self.image_processor(images, return_tensors=return_tensors)
else:
image_inputs = {}
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
)
return BatchFeature(data={**text_inputs, **image_inputs})
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
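
A short sketch (not part of the diff) of what the processor defined above returns: tokenizer fields and image-processor fields merged into a single `BatchFeature`. The checkpoint name is the converted one from this PR.

```python
import requests
from PIL import Image

from transformers import LlavaNextProcessor

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
batch = processor(text="[INST] <image>\nHow many cats are there? [/INST]", images=image, return_tensors="pt")

# Tokenizer outputs and image-processor outputs live side by side in one BatchFeature,
# typically input_ids, attention_mask, pixel_values and image_sizes.
print(list(batch.keys()))
print(processor.model_input_names)
```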
@@ -13,6 +13,8 @@
# limitations under the License.
""" VipLlava model configuration"""

+import warnings
+
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
@@ -51,9 +53,6 @@ class VipLlavaConfig(PretrainedConfig):
            The layer norm epsilon of the projector layernorm
        vision_feature_layers (`List[int]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`):
            The list of layers to select the vision features from.
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the VipLlava model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~VipLlavaForConditionalGeneration`]

    Example:
@@ -88,7 +87,6 @@ class VipLlavaConfig(PretrainedConfig):
        projector_hidden_act="gelu",
        projector_layernorm_eps=1e-5,
        vision_feature_layers=[-2, -5, -8, -11, 6],
-        vocab_size=32000,
        **kwargs,
    ):
        self.ignore_index = ignore_index
@@ -96,7 +94,12 @@ class VipLlavaConfig(PretrainedConfig):
        self.projector_hidden_act = projector_hidden_act
        self.projector_layernorm_eps = projector_layernorm_eps
        self.vision_feature_layers = vision_feature_layers
-        self.vocab_size = vocab_size
+
+        if "vocab_size" in kwargs:
+            warnings.warn(
+                "The `vocab_size` argument is deprecated and will be removed in v4.42, since it can be inferred from the `text_config`. Passing this argument has no effect",
+                FutureWarning,
+            )

        self.vision_config = vision_config
@@ -116,15 +119,27 @@ class VipLlavaConfig(PretrainedConfig):
                vocab_size=32000,
                projection_dim=768,
            )
-        self.vocab_size = self.vocab_size

-        self.text_config = text_config
-
-        if isinstance(self.text_config, dict):
+        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-            self.vocab_size = self.text_config.vocab_size
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
-            self.text_config = CONFIG_MAPPING["llama"]()
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self._vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)

+    @property
+    def vocab_size(self):
+        warnings.warn(
+            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
+            FutureWarning,
+        )
+        return self._vocab_size
+
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_vocab_size", None)
+        return output
@@ -248,7 +248,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
-        self.vocab_size = config.vocab_size
+        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
@@ -280,7 +280,6 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds
......
@@ -4806,7 +4806,17 @@ class LlavaPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])


-class LlavaProcessor(metaclass=DummyObject):
+LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class LlavaNextForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class LlavaNextPreTrainedModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
......
@@ -310,6 +310,13 @@ class LevitImageProcessor(metaclass=DummyObject):
        requires_backends(self, ["vision"])


+class LlavaNextImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
class Mask2FormerImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]
......
@@ -14,6 +14,7 @@
# limitations under the License.
""" Testing suite for the PyTorch Llava model. """

+import copy
import gc
import unittest

@@ -76,7 +77,6 @@ class LlavaVisionText2TextModelTester:
        },
        is_training=True,
        vision_config={
-            "batch_size": 12,
            "image_size": 30,
            "patch_size": 2,
            "num_channels": 3,
@@ -159,9 +159,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
    all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
    pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
-    fx_compatible = False
    test_pruning = False
-    test_resize_embeddings = True
    test_head_masking = False

    def setUp(self):
@@ -186,6 +184,171 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass
# Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_tokens_embeddings with config.vocab_size->config.text_config.vocab_size
def test_resize_tokens_embeddings(self):
(
original_config,
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
if self.model_tester.is_training is False:
model.eval()
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone them
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertEqual(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_embeddings_untied with config.vocab_size->config.text_config.vocab_size
def test_resize_embeddings_untied(self):
(
original_config,
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
original_config.tie_word_embeddings = False
# if the model cannot untie its embeddings -> leave the test
if original_config.tie_word_embeddings:
return
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
# if no output embeddings -> leave test
if model.get_output_embeddings() is None:
continue
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Copied from tests.test_modeling_common.ModelTesterMixin.test_tie_model_weights with config.vocab_size->config.text_config.vocab_size
def test_tie_model_weights(self):
if not self.test_torchscript:
return
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_same_values(layer_1, layer_2):
equal = True
for p1, p2 in zip(layer_1.weight, layer_2.weight):
if p1.data.ne(p2.data).sum() > 0:
equal = False
return equal
for model_class in self.all_model_classes:
config.torchscript = True
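# With `torchscript=True`, weight tying clones the output embeddings instead of sharing them with
# the input embeddings, so this instance serves as the untied reference.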
model_not_tied = model_class(config)
if model_not_tied.get_output_embeddings() is None:
continue
config_tied = copy.deepcopy(config)
config_tied.torchscript = False
model_tied = model_class(config_tied)
params_tied = list(model_tied.parameters())
# Check that the embedding layer and decoding layer are the same in size and in value
# self.assertTrue(check_same_values(embeddings, decoding))
# Check that after resize they remain tied.
model_tied.resize_token_embeddings(config.text_config.vocab_size + 10)
params_tied_2 = list(model_tied.parameters())
self.assertEqual(len(params_tied_2), len(params_tied))
@require_torch
class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
...
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.models.llava_next.image_processing_llava_next import select_best_resolution
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import LlavaNextImageProcessor
class LlavaNextImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
):
size = size if size is not None else {"shortest_edge": 20}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
def expected_output_image_shape(self, images):
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = LlavaNextImageProcessor if is_vision_available() else None
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
def setUp(self):
self.image_processor_tester = LlavaNextImageProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "center_crop"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "image_grid_pinpoints"))
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 20})
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
def test_select_best_resolution(self):
possible_resolutions = [[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]
# Test with a square aspect ratio
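# select_best_resolution keeps the candidate that preserves the most of the original image with the
# least padding; ties are broken by list order, which is why (672, 336) wins over (336, 672) here.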
best_resolution = select_best_resolution((336, 336), possible_resolutions)
self.assertEqual(best_resolution, (672, 336))
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
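# The 1445 patches appear to come from the processor defaults: the 400x400 equal-resolution test
# images map to a 672x672 best-fit grid, i.e. ceil(672 / 18) ** 2 = 1444 crop-size patches plus the
# downsampled base image.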
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass