Unverified Commit d91fd7f9 authored by NielsRogge, committed by GitHub

Add LLaVa-1.6, bis (#29586)



* First draft

* Fix tests, add docs

* Improve docstrings

* Fix test

* Address comments

* Address comments

* Remove vocab_size attribute

* Remove batch_size

* Address comment

* Add image processor tests

* Support fx

* Update docstring

* Add support for 34b

* Convert 34b model

* Add integration tests

* Update checkpoints

* Convert vicuna-13b, remove doc tests

* Remove script

* Remove file

* Address comments

* Improve docstrings

* Deprecate vocab_size

* Remove aspect_ratio_setting

* Address comments

* Update READMEs

* Add tips about chat templates

* Fix tests

* Deprecate vocab_size safely

* Update tests

---------
Co-authored-by: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
parent 9d999481
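
Not part of the diff: a minimal usage sketch for the classes added in this PR, assuming the converted `llava-hf/llava-v1.6-mistral-7b-hf` checkpoint referenced below, the Mistral prompt format used in the conversion script, and a CUDA device for the `.to("cuda")` calls.

```python
import requests
import torch
from PIL import Image

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

# Checkpoint name taken from the config archive map / conversion script in this diff.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# Mistral-style prompt, as used for liuhaotian/llava-v1.6-mistral-7b in the conversion script.
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```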
@@ -77,6 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("layoutlmv3", "LayoutLMv3ImageProcessor"),
        ("levit", "LevitImageProcessor"),
        ("llava", "CLIPImageProcessor"),
+        ("llava_next", "CLIPImageProcessor"),
        ("mask2former", "Mask2FormerImageProcessor"),
        ("maskformer", "MaskFormerImageProcessor"),
        ("mgp-str", "ViTImageProcessor"),
......
@@ -286,6 +286,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
        ("idefics", "IdeficsForVisionText2Text"),
        ("layoutlm", "LayoutLMForMaskedLM"),
        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
        ("longformer", "LongformerForMaskedLM"),
        ("luke", "LukeForMaskedLM"),
        ("lxmert", "LxmertForPreTraining"),
@@ -675,6 +676,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
        ("instructblip", "InstructBlipForConditionalGeneration"),
        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
        ("pix2struct", "Pix2StructForConditionalGeneration"),
        ("vipllava", "VipLlavaForConditionalGeneration"),
        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
......
@@ -66,6 +66,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("layoutlmv2", "LayoutLMv2Processor"),
        ("layoutlmv3", "LayoutLMv3Processor"),
        ("llava", "LlavaProcessor"),
+        ("llava_next", "LlavaNextProcessor"),
        ("markuplm", "MarkupLMProcessor"),
        ("mctct", "MCTCTProcessor"),
        ("mgp-str", "MgpstrProcessor"),
......
@@ -223,6 +223,7 @@ else:
            ),
        ),
        ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+        ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
        (
            "longt5",
......
@@ -16,7 +16,10 @@ from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

-_import_structure = {"configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"]}
+_import_structure = {
+    "configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"],
+    "processing_llava": ["LlavaProcessor"],
+}

try:
@@ -30,11 +33,11 @@ else:
        "LlavaForConditionalGeneration",
        "LlavaPreTrainedModel",
    ]
-    _import_structure["processing_llava"] = ["LlavaProcessor"]

if TYPE_CHECKING:
    from .configuration_llava import LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaConfig
+    from .processing_llava import LlavaProcessor

try:
    if not is_torch_available():
@@ -47,8 +50,6 @@ if TYPE_CHECKING:
            LlavaForConditionalGeneration,
            LlavaPreTrainedModel,
        )
-        from .processing_llava import LlavaProcessor

else:
    import sys
......
@@ -13,6 +13,8 @@
# limitations under the License.
""" Llava model configuration"""

+import warnings
+
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
@@ -37,10 +39,10 @@ class LlavaConfig(PretrainedConfig):
    documentation from [`PretrainedConfig`] for more information.

    Args:
-        vision_config (`LlavaVisionConfig`, *optional*):
-            Custom vision config or dict
-        text_config (`Union[AutoConfig, dict]`, *optional*):
-            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
@@ -48,12 +50,10 @@ class LlavaConfig(PretrainedConfig):
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
-            The feature selection strategy used to select the vision feature from the CLIP backbone.
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer to select the vision feature.
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]

    Example:
@@ -88,25 +88,34 @@ class LlavaConfig(PretrainedConfig):
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
-        vocab_size=32000,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(
+                "vision_feature_select_strategy should be one of 'default', 'full'."
+                f"Got: {vision_feature_select_strategy}"
+            )
+
+        if "vocab_size" in kwargs:
+            warnings.warn(
+                "The `vocab_size` argument is deprecated and will be removed in v4.42, since it can be inferred from the `text_config`. Passing this argument has no effect",
+                FutureWarning,
+            )
+
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
-        self.vocab_size = vocab_size

-        self.vision_config = vision_config
-
-        if isinstance(self.vision_config, dict):
+        if isinstance(vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
            )
-            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
-            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
@@ -116,15 +125,29 @@ class LlavaConfig(PretrainedConfig):
                vocab_size=32000,
                projection_dim=768,
            )
-        self.vocab_size = self.vocab_size

-        self.text_config = text_config
+        self.vision_config = vision_config

-        if isinstance(self.text_config, dict):
+        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-            self.vocab_size = self.text_config.vocab_size
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
-            self.text_config = CONFIG_MAPPING["llama"]()
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self._vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)

+    @property
+    def vocab_size(self):
+        warnings.warn(
+            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
+            FutureWarning,
+        )
+        return self._vocab_size
+
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_vocab_size", None)
+        return output
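
A quick illustration (not part of the diff) of how the `vocab_size` deprecation above behaves once applied: the attribute still resolves to the text backbone's vocabulary size but emits a `FutureWarning`, while `text_config.vocab_size` is the supported access path.

```python
import warnings

from transformers import LlavaConfig

config = LlavaConfig()

# Supported access path after this change.
print(config.text_config.vocab_size)

# Deprecated access path: returns the same value but warns.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(config.vocab_size)

assert any(issubclass(w.category, FutureWarning) for w in caught)
```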
@@ -216,6 +216,11 @@ LLAVA_INPUTS_DOCSTRING = r"""
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
@@ -240,7 +245,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = LlavaMultiModalProjector(config)
-        self.vocab_size = config.vocab_size
+        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
@@ -272,7 +277,6 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds
@@ -433,38 +437,38 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
                )
                if labels is None:
                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
-            else:
-                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
-                # generation with cache
-                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
-                    # Retrieve the first layer to inspect the logits and mask out the hidden states
-                    # that are set to 0
-                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                    # Get the target length
-                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
-
-                    extended_attention_mask = torch.ones(
-                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
-                        dtype=attention_mask.dtype,
-                        device=attention_mask.device,
-                    )
-
-                    # Filter out only the tokens that can be un-attended, this can happen
-                    # if one uses Llava + Fused modules where the cache on the
-                    # first iteration is already big enough, or if one passes custom cache
-                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                    new_batch_index = batch_index[valid_indices]
-                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+            # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+            # generation with cache
+            elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                # Retrieve the first layer to inspect the logits and mask out the hidden states
+                # that are set to 0
+                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                # Get the target length
+                target_seqlen = first_layer_past_key_value.shape[-1] + 1
+
+                extended_attention_mask = torch.ones(
+                    (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+
+                # Filter out only the tokens that can be un-attended, this can happen
+                # if one uses Llava + Fused modules where the cache on the
+                # first iteration is already big enough, or if one passes custom cache
+                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                new_batch_index = batch_index[valid_indices]
+                new_non_attended_tokens = non_attended_tokens[valid_indices]

                # Zero-out the places where we don't need to attend
                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0

                attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1

        outputs = self.language_model(
            attention_mask=attention_mask,
......
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_llava_next": ["LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaNextConfig"],
"processing_llava_next": ["LlavaNextProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_llava_next"] = [
"LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"LlavaNextForConditionalGeneration",
"LlavaNextPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_llava_next"] = ["LlavaNextImageProcessor"]
if TYPE_CHECKING:
from .configuration_llava_next import LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaNextConfig
from .processing_llava_next import LlavaNextProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_llava_next import (
LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
LlavaNextForConditionalGeneration,
LlavaNextPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_llava_next import LlavaNextImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Llava-NeXT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"llava-hf/llava-v1.6-mistral-7b-hf": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/resolve/main/config.json",
}
class LlavaNextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
The config object or dictionary of the vision backbone.
text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
The config object or dictionary of the text backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
image_token_index (`int`, *optional*, defaults to 32000):
The image token index to encode the image prompt.
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the multimodal projector.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
If `"full"`, the full vision features are used.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
Example:
```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig
>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()
>>> # Initializing a Llama config
>>> text_config = LlamaConfig()
>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)
>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llava_next"
is_composition = False
def __init__(
self,
vision_config=None,
text_config=None,
ignore_index=-100,
image_token_index=32000,
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_grid_pinpoints=None,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
"vision_feature_select_strategy should be one of 'default', 'full'."
f"Got: {vision_feature_select_strategy}"
)
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
image_grid_pinpoints = (
image_grid_pinpoints
if image_grid_pinpoints is not None
else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
)
self.image_grid_pinpoints = image_grid_pinpoints
if isinstance(vision_config, dict):
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
hidden_size=1024,
patch_size=14,
image_size=336,
num_hidden_layers=24,
num_attention_heads=16,
vocab_size=32000,
projection_dim=768,
)
self.vision_config = vision_config
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
self.text_config = text_config
super().__init__(**kwargs)
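
To make `image_grid_pinpoints` concrete, here is an illustrative sketch, not the library's implementation, of how one of these candidate resolutions can be chosen for an input image: keep the pinpoint that preserves the largest downscaled image area and, on ties, wastes the least padding. The helper name is made up for this example.

```python
from typing import List, Tuple


def choose_grid_pinpoint(original_size: Tuple[int, int], pinpoints: List[List[int]]) -> Tuple[int, int]:
    """Toy 'best resolution' selection over the (height, width) pinpoints of LlavaNextConfig."""
    orig_height, orig_width = original_size
    best, best_effective, best_wasted = None, -1, float("inf")
    for height, width in pinpoints:
        # Scale the image to fit inside the candidate resolution without distortion.
        scale = min(width / orig_width, height / orig_height)
        down_width, down_height = int(orig_width * scale), int(orig_height * scale)
        effective = min(down_width * down_height, orig_width * orig_height)  # usable image area
        wasted = height * width - effective  # padded area
        if effective > best_effective or (effective == best_effective and wasted < best_wasted):
            best, best_effective, best_wasted = (height, width), effective, wasted
    return best


# Default pinpoints from LlavaNextConfig; the 899x1024 image used in the conversion script
# below lands on the 672x672 candidate under this heuristic.
pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
print(choose_grid_pinpoint((899, 1024), pinpoints))  # -> (672, 672)
```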
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert LLaVa-NeXT (LLaVa-1.6) checkpoints from the original repository.
URL: https://github.com/haotian-liu/LLaVA/tree/main.
The command used to obtain original logits is the following:
python llava/eval/run_llava.py --model-path "liuhaotian/llava-v1.6-mistral-7b" --image-file "images/llava_v1_5_radar.jpg" --query "What is shown in this image?" --max_new_tokens 100 --temperature 0
Note: logits are tested with torch==2.1.2.
"""
import argparse
import glob
import json
from pathlib import Path
import requests
import torch
from accelerate import init_empty_weights
from huggingface_hub import hf_hub_download, snapshot_download
from PIL import Image
from safetensors import safe_open
from transformers import (
AddedToken,
AutoConfig,
AutoTokenizer,
LlavaNextConfig,
LlavaNextForConditionalGeneration,
LlavaNextImageProcessor,
LlavaNextProcessor,
)
KEYS_TO_MODIFY_MAPPING = {
"model.vision_tower.": "",
"model.mm_projector": "multi_modal_projector",
"model": "model.model",
"vision_model.model": "vision_model",
"lm_head": "language_model.lm_head",
"model.model": "language_model.model",
"multi_modal_projector.0": "multi_modal_projector.linear_1",
"multi_modal_projector.2": "multi_modal_projector.linear_2",
"language_model.model.image_newline": "image_newline",
}
def load_original_state_dict(model_id):
directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"])
original_state_dict = {}
for path in glob.glob(f"{directory_path}/*"):
if path.endswith(".safetensors"):
with safe_open(path, framework="pt", device="cpu") as f:
for key in f.keys():
original_state_dict[key] = f.get_tensor(key)
return original_state_dict
def convert_state_dict_to_hf(state_dict):
new_state_dict = {}
for key, value in state_dict.items():
if key.endswith(".inv_freq"):
continue
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
new_state_dict[key] = value.to(torch.float16)
return new_state_dict
def load_image():
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
return image
def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
# load original config
filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model")
# read json
with open(filepath) as f:
data = json.load(f)
print(data)
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
text_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
text_model_id = "lmsys/vicuna-7b-v1.5"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
text_model_id = "lmsys/vicuna-13b-v1.5"
image_token_index = 32000
elif model_id == "liuhaotian/llava-v1.6-34b":
text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B"
image_token_index = 64000
vision_model_id = data["mm_vision_tower"]
torch.set_default_dtype(torch.float16)
text_config = AutoConfig.from_pretrained(text_model_id)
use_fast = False if model_id == "liuhaotian/llava-v1.6-34b" else True
tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast)
tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special_tokens=True)
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
# Mistral-7B doesn't have a padding token set yet
tokenizer.add_special_tokens({"pad_token": "<pad>"})
image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id)
processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor)
config = LlavaNextConfig(
text_config=text_config.to_dict(),
image_grid_pinpoints=image_processor.image_grid_pinpoints,
use_image_newline_parameter=True,
image_token_index=image_token_index,
)
with init_empty_weights():
model = LlavaNextForConditionalGeneration(config)
# load original state dict
state_dict = load_original_state_dict(model_id)
state_dict = convert_state_dict_to_hf(state_dict)
model.load_state_dict(state_dict, assign=True)
model.eval()
pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data
mu = torch.mean(pre_expansion_embeddings, dim=0).float()
n = pre_expansion_embeddings.size()[0]
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)
# We add an image token so we resize the model
# Pad to 64 for performance reasons
pad_shape = 64
vocab_size = config.text_config.vocab_size
if model_id == "liuhaotian/llava-v1.6-34b":
# this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and <image>
num_tokens = vocab_size + 3
else:
# this one has 2 additional tokens, namely <image> and <pad>
num_tokens = vocab_size + 2
model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape)
model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack(
tuple(
(dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]))
),
dim=0,
)
model.language_model.lm_head.weight.data[vocab_size:] = torch.stack(
tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))),
dim=0,
)
device = "cuda:2"
model.to(device)
# prepare inputs
image = load_image()
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]:
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
elif model_id == "liuhaotian/llava-v1.6-34b":
prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
inputs = processor(images=image, text=prompt, return_tensors="pt")
# verify inputs
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset")
original_pixel_values = torch.load(filepath, map_location="cpu")
assert torch.allclose(original_pixel_values, inputs.pixel_values.half())
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset")
original_input_ids = torch.load(filepath, map_location="cpu")
# replace -200 by image_token_index (since we use token ID = 32000 for the image token)
original_input_ids[original_input_ids == -200] = image_token_index
print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200]))
assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()
elif model_id == "liuhaotian/llava-v1.6-34b":
filepath = hf_hub_download(
repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset"
)
original_input_ids = torch.load(filepath, map_location="cpu")
# replace -200 by image_token_index
original_input_ids[original_input_ids == -200] = image_token_index
assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()
image_sizes = torch.tensor([[899, 1024]])
assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist()
# verify single forward pass
print("Single forward pass")
with torch.inference_mode():
inputs = inputs.to(device)
outputs = model(**inputs)
print("Shape of logits:", outputs.logits.shape)
print("First values of logits:", outputs.logits[0, :3, :3])
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
expected_slice = torch.tensor(
[[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
expected_slice = torch.tensor(
[[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
expected_slice = torch.tensor(
[[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]],
dtype=torch.float32,
device=device,
)
elif model_id == "liuhaotian/llava-v1.6-34b":
expected_slice = torch.tensor(
[[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]],
dtype=torch.float32,
device=device,
)
else:
raise ValueError(f"Model {model_id} not supported")
assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)
print("Logits are ok!")
# verify generation
output_ids = model.generate(
**inputs,
max_new_tokens=100,
use_cache=True,
)
generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print("Generated text:", repr(generated_text))
if model_id == "liuhaotian/llava-v1.6-mistral-7b":
expected_text = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "'
elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V"""
elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM"
elif model_id == "liuhaotian/llava-v1.6-34b":
expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-"
else:
raise ValueError(f"Model {model_id} not supported")
assert generated_text == expected_text
print("Generated text is ok!")
# verify batched generation
print("Batched generation...")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
cats_image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
images=[image, cats_image],
text=[prompt, "[INST] <image>\nHow many cats are there? [/INST]"],
padding=True,
return_tensors="pt",
).to(device)
for k, v in inputs.items():
print(k, v.shape)
print("Image sizes:", inputs.image_sizes)
# make sure image_sizes are the same
# as otherwise batched generation doesn't work
inputs.image_sizes[1] = inputs.image_sizes[0]
print("Batched generation...")
output_ids = model.generate(
**inputs,
max_new_tokens=20,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(outputs)
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
repo_id = model_id.split("/")[-1]
model.push_to_hub(f"llava-hf/{repo_id}-hf")
processor.push_to_hub(f"llava-hf/{repo_id}-hf")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_id",
help="Hub location of the model to convert",
default="liuhaotian/llava-v1.6-mistral-7b",
choices=[
"liuhaotian/llava-v1.6-mistral-7b",
"liuhaotian/llava-v1.6-vicuna-7b",
"liuhaotian/llava-v1.6-vicuna-13b",
"liuhaotian/llava-v1.6-34b",
],
required=False,
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub)
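
For reference, a minimal way to drive the conversion from Python rather than through argparse; the module name is hypothetical and depends on where the script above is saved, while the function and its arguments come directly from the script.

```python
# Hypothetical module name; adjust to wherever the conversion script above lives.
from convert_llava_next_weights_to_hf import convert_llava_to_hf

# Convert the Mistral-7B variant and save the HF-format model and processor locally,
# without pushing to the Hub (mirrors the script's argparse defaults).
convert_llava_to_hf(
    model_id="liuhaotian/llava-v1.6-mistral-7b",
    pytorch_dump_folder_path="./llava-v1.6-mistral-7b-hf",
    push_to_hub=False,
)
```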
This diff is collapsed.
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LLaVa-NeXT.
"""
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LlavaNextProcessor(ProcessorMixin):
r"""
Constructs a LLaVa-NeXT processor which wraps a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
Args:
image_processor ([`LlavaNextImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LlavaNextImageProcessor"
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None):
super().__init__(image_processor, tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
images: ImageInput = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length=None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if images is not None:
image_inputs = self.image_processor(images, return_tensors=return_tensors)
else:
image_inputs = {}
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
)
return BatchFeature(data={**text_inputs, **image_inputs})
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
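
A short sketch (not part of the diff) of what the processor defined above returns: tokenizer fields and image-processor fields merged into a single `BatchFeature`. The checkpoint name is the converted one from this PR.

```python
import requests
from PIL import Image

from transformers import LlavaNextProcessor

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
batch = processor(text="[INST] <image>\nHow many cats are there? [/INST]", images=image, return_tensors="pt")

# Tokenizer outputs and image-processor outputs live side by side in one BatchFeature,
# typically input_ids, attention_mask, pixel_values and image_sizes.
print(list(batch.keys()))
print(processor.model_input_names)
```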
@@ -13,6 +13,8 @@
# limitations under the License.
""" VipLlava model configuration"""

+import warnings
+
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
@@ -51,9 +53,6 @@ class VipLlavaConfig(PretrainedConfig):
            The layer norm epsilon of the projector layernorm
        vision_feature_layers (`List[int]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`):
            The list of layers to select the vision features from.
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the VipLlava model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~VipLlavaForConditionalGeneration`]

    Example:
@@ -88,7 +87,6 @@ class VipLlavaConfig(PretrainedConfig):
        projector_hidden_act="gelu",
        projector_layernorm_eps=1e-5,
        vision_feature_layers=[-2, -5, -8, -11, 6],
-        vocab_size=32000,
        **kwargs,
    ):
        self.ignore_index = ignore_index
@@ -96,7 +94,12 @@ class VipLlavaConfig(PretrainedConfig):
        self.projector_hidden_act = projector_hidden_act
        self.projector_layernorm_eps = projector_layernorm_eps
        self.vision_feature_layers = vision_feature_layers
-        self.vocab_size = vocab_size
+
+        if "vocab_size" in kwargs:
+            warnings.warn(
+                "The `vocab_size` argument is deprecated and will be removed in v4.42, since it can be inferred from the `text_config`. Passing this argument has no effect",
+                FutureWarning,
+            )

        self.vision_config = vision_config
@@ -116,15 +119,27 @@ class VipLlavaConfig(PretrainedConfig):
                vocab_size=32000,
                projection_dim=768,
            )
-        self.vocab_size = self.vocab_size

-        self.text_config = text_config
-
-        if isinstance(self.text_config, dict):
+        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-            self.vocab_size = self.text_config.vocab_size
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
-            self.text_config = CONFIG_MAPPING["llama"]()
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self._vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)

+    @property
+    def vocab_size(self):
+        warnings.warn(
+            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
+            FutureWarning,
+        )
+        return self._vocab_size
+
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_vocab_size", None)
+        return output
@@ -248,7 +248,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
-        self.vocab_size = config.vocab_size
+        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
@@ -280,7 +280,6 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds
......
@@ -4806,7 +4806,17 @@ class LlavaPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])


-class LlavaProcessor(metaclass=DummyObject):
+LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class LlavaNextForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class LlavaNextPreTrainedModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
......
@@ -310,6 +310,13 @@ class LevitImageProcessor(metaclass=DummyObject):
        requires_backends(self, ["vision"])


+class LlavaNextImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
class Mask2FormerImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]
......
@@ -14,6 +14,7 @@
# limitations under the License.
""" Testing suite for the PyTorch Llava model. """

+import copy
import gc
import unittest

@@ -76,7 +77,6 @@ class LlavaVisionText2TextModelTester:
        },
        is_training=True,
        vision_config={
-            "batch_size": 12,
            "image_size": 30,
            "patch_size": 2,
            "num_channels": 3,
@@ -159,9 +159,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
    all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
    pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
-    fx_compatible = False
    test_pruning = False
-    test_resize_embeddings = True
    test_head_masking = False

    def setUp(self):
@@ -186,6 +184,171 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass
# Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_tokens_embeddings with config.vocab_size->config.text_config.vocab_size
def test_resize_tokens_embeddings(self):
(
original_config,
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
if self.model_tester.is_training is False:
model.eval()
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone them
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertEqual(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_embeddings_untied with config.vocab_size->config.text_config.vocab_size
def test_resize_embeddings_untied(self):
(
original_config,
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
original_config.tie_word_embeddings = False
# if the model cannot untie its embeddings -> leave the test
if original_config.tie_word_embeddings:
return
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
# if no output embeddings -> leave test
if model.get_output_embeddings() is None:
continue
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Copied from tests.test_modeling_common.ModelTesterMixin.test_tie_model_weights with config.vocab_size->config.text_config.vocab_size
def test_tie_model_weights(self):
if not self.test_torchscript:
return
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_same_values(layer_1, layer_2):
equal = True
for p1, p2 in zip(layer_1.weight, layer_2.weight):
if p1.data.ne(p2.data).sum() > 0:
equal = False
return equal
for model_class in self.all_model_classes:
config.torchscript = True
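# With `torchscript=True`, weight tying clones the output embeddings instead of sharing them with
# the input embeddings, so this instance serves as the untied reference.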
model_not_tied = model_class(config)
if model_not_tied.get_output_embeddings() is None:
continue
config_tied = copy.deepcopy(config)
config_tied.torchscript = False
model_tied = model_class(config_tied)
params_tied = list(model_tied.parameters())
# Check that the embedding layer and decoding layer are the same in size and in value
# self.assertTrue(check_same_values(embeddings, decoding))
# Check that after resize they remain tied.
model_tied.resize_token_embeddings(config.text_config.vocab_size + 10)
params_tied_2 = list(model_tied.parameters())
self.assertEqual(len(params_tied_2), len(params_tied))
@require_torch
class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
...
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.models.llava_next.image_processing_llava_next import select_best_resolution
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import LlavaNextImageProcessor
class LlavaNextImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
):
size = size if size is not None else {"shortest_edge": 20}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
def expected_output_image_shape(self, images):
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = LlavaNextImageProcessor if is_vision_available() else None
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
def setUp(self):
self.image_processor_tester = LlavaNextImageProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "center_crop"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "image_grid_pinpoints"))
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 20})
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
def test_select_best_resolution(self):
possible_resolutions = [[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]
# Test with a square aspect ratio
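# select_best_resolution keeps the candidate that preserves the most of the original image with the
# least padding; ties are broken by list order, which is why (672, 336) wins over (336, 672) here.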
best_resolution = select_best_resolution((336, 336), possible_resolutions)
self.assertEqual(best_resolution, (672, 336))
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
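# The 1445 patches appear to come from the processor defaults: the 400x400 equal-resolution test
# images map to a 672x672 best-fit grid, i.e. ceil(672 / 18) ** 2 = 1444 crop-size patches plus the
# downsampled base image.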
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1445, 3, 18, 18)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass