"examples/tensorflow/language-modeling/run_mlm.py" did not exist on "6287c929c162a417cd5d355c9113e9908710858d"
Unverified Commit 6b78360e authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Add Idefics2 (#30253)



* Initial add model additions

* Test

* All weights loading

* Can perform full forward pass

* Local and remote the same

* Matching local and remote

* Fixup

* Idefics2Model importable; fixup docstrings

* Don't skip by default

* Remove deprecated use_resampler arg

* Remove self.config

* DecoupledLinear takes config

* Tidy up

* Enable eager attention and tidy up

* Most tests passing

* Update for batch of processed images

* Add image processor

* Update doc pages

* Update conversion script

* Remove erroneous breakpoint

* Remove accidendtal spelling change

* Update to reflect changes on hub - make generate work

* Fix up

* Image processor tests

* Update tests

* Add a processor

* Add a processor

* Update convert script

* Update modeling file - remove fixmes

* Bug fix

* Add processing test

* Use processor

* Fix up

* Update src/transformers/models/idefics2/modeling_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update src/transformers/models/idefics2/modeling_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Fix test

* Update config - PR comments and defaults align with checkpoint

* Reviewer comments

* Add copied froms for flahs attention

* Update src/transformers/models/idefics2/modeling_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Apply suggestions from code review
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Remove qk_layer_norm and freeze_layers functionality

* Fix

* Remove freeze_layer options from config

* Sync with upstream main

* Fix attention shapes siglip

* Remove Llava-next refs - TO REBASE

* Use AutoModel for text model

* Add comment to explain vision embeddings

* Fix issue with tie_word_embeddings

* Address review comments

* Fix and fix up

* Chat templates for idefics

* Fix copies

* Fix

* Add layer norms to FA2

* Fix tests

* Apply suggestions from code review
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Fix

* Review comments

* Update src/transformers/models/idefics2/modeling_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update inputs merger

* Merge weights in correct order

* Update convert script

* Update src/transformers/models/idefics2/processing_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update template

* Model code examples (fix idefics too)

* More review comments

* Tidy up

* Update processing

* Fix attention mask preparation

* Update inputs_merger inputs

* Vectorize inputs_merger

* Update src/transformers/models/idefics2/__init__.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/idefics2/modeling_idefics2.py

* Review comments

* saying bye to the `qk_layer_norms`

* Simplify

* Update latents

* Remove erroneuous readme changes

* Return images when applying chat template

* Fix bug - prompt images are for a single sample

* Update src/transformers/models/idefics2/modeling_idefics2.py

* image splitting

* fix test

* some more comment

* some comment

* Apply suggestions from code review
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/idefics2/image_processing_idefics2.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update processor

* Update model tests

* Update src/transformers/models/idefics2/processing_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update src/transformers/models/idefics2/processing_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Don't add BOS in template

* Update src/transformers/models/idefics2/processing_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Remove index in examples

* Update tests to reflect #13

* Update src/transformers/models/idefics2/processing_idefics2.py
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* PR comment - consistent typing

* Update readme and model doc

* Update docs

* Update checkpoint references

* Update examples

* Fix and update tests

* Small addition

* Update tests - remove copied from as no ignore placement copy could be found

* Update example

* small fixes

* Update docs/source/en/model_doc/idefics2.md
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update docs/source/en/model_doc/idefics2.md
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Update README.md
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>

* Connector model as bridge

* Fix up

* Fix up

* Don't pass model inputs for generation kwargs update

* IDEFICS-2 -> Idefics2

* Remove config archive name

* IDEFICS-2 -> Idefics2

* Add back llava-next

* Update readmes

* Add requirements for processor tester

* Use custom convert_to_rgb to avoid possible BC

* Fix doc example

* Fix doc example

* Skip model doc tests - as model to large

* More doc example - account for image splitting

* Update src/transformers/image_transforms.py

* Fix config doctest

---------
Co-authored-by: default avatarPablo Montalvo <39954772+molbap@users.noreply.github.com>
Co-authored-by: default avatarArthurZucker <arthur.zucker@gmail.com>
Co-authored-by: default avatarVictor SANH <victorsanh@gmail.com>
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
parent 667939a2
...@@ -111,6 +111,7 @@ from . import ( ...@@ -111,6 +111,7 @@ from . import (
hubert, hubert,
ibert, ibert,
idefics, idefics,
idefics2,
imagegpt, imagegpt,
informer, informer,
instructblip, instructblip,
......
...@@ -125,6 +125,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ...@@ -125,6 +125,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("hubert", "HubertConfig"), ("hubert", "HubertConfig"),
("ibert", "IBertConfig"), ("ibert", "IBertConfig"),
("idefics", "IdeficsConfig"), ("idefics", "IdeficsConfig"),
("idefics2", "Idefics2Config"),
("imagegpt", "ImageGPTConfig"), ("imagegpt", "ImageGPTConfig"),
("informer", "InformerConfig"), ("informer", "InformerConfig"),
("instructblip", "InstructBlipConfig"), ("instructblip", "InstructBlipConfig"),
...@@ -283,6 +284,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ...@@ -283,6 +284,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
] ]
) )
MODEL_NAMES_MAPPING = OrderedDict( MODEL_NAMES_MAPPING = OrderedDict(
[ [
# Add full (and cased) model names here # Add full (and cased) model names here
...@@ -390,6 +392,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ...@@ -390,6 +392,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("hubert", "Hubert"), ("hubert", "Hubert"),
("ibert", "I-BERT"), ("ibert", "I-BERT"),
("idefics", "IDEFICS"), ("idefics", "IDEFICS"),
("idefics2", "Idefics2"),
("imagegpt", "ImageGPT"), ("imagegpt", "ImageGPT"),
("informer", "Informer"), ("informer", "Informer"),
("instructblip", "InstructBLIP"), ("instructblip", "InstructBLIP"),
......
...@@ -71,6 +71,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ...@@ -71,6 +71,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("grounding-dino", "GroundingDinoImageProcessor"), ("grounding-dino", "GroundingDinoImageProcessor"),
("groupvit", "CLIPImageProcessor"), ("groupvit", "CLIPImageProcessor"),
("idefics", "IdeficsImageProcessor"), ("idefics", "IdeficsImageProcessor"),
("idefics2", "Idefics2ImageProcessor"),
("imagegpt", "ImageGPTImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"),
("instructblip", "BlipImageProcessor"), ("instructblip", "BlipImageProcessor"),
("kosmos-2", "CLIPImageProcessor"), ("kosmos-2", "CLIPImageProcessor"),
......
...@@ -120,6 +120,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ...@@ -120,6 +120,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("hubert", "HubertModel"), ("hubert", "HubertModel"),
("ibert", "IBertModel"), ("ibert", "IBertModel"),
("idefics", "IdeficsModel"), ("idefics", "IdeficsModel"),
("idefics2", "Idefics2Model"),
("imagegpt", "ImageGPTModel"), ("imagegpt", "ImageGPTModel"),
("informer", "InformerModel"), ("informer", "InformerModel"),
("jukebox", "JukeboxModel"), ("jukebox", "JukeboxModel"),
...@@ -287,6 +288,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( ...@@ -287,6 +288,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("ibert", "IBertForMaskedLM"), ("ibert", "IBertForMaskedLM"),
("idefics", "IdeficsForVisionText2Text"), ("idefics", "IdeficsForVisionText2Text"),
("idefics2", "Idefics2ForConditionalGeneration"),
("layoutlm", "LayoutLMForMaskedLM"), ("layoutlm", "LayoutLMForMaskedLM"),
("llava", "LlavaForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"),
...@@ -678,6 +680,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( ...@@ -678,6 +680,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
("blip", "BlipForConditionalGeneration"), ("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"),
("git", "GitForCausalLM"), ("git", "GitForCausalLM"),
("idefics2", "Idefics2ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"),
......
...@@ -61,6 +61,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ...@@ -61,6 +61,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("groupvit", "CLIPProcessor"), ("groupvit", "CLIPProcessor"),
("hubert", "Wav2Vec2Processor"), ("hubert", "Wav2Vec2Processor"),
("idefics", "IdeficsProcessor"), ("idefics", "IdeficsProcessor"),
("idefics2", "Idefics2Processor"),
("instructblip", "InstructBlipProcessor"), ("instructblip", "InstructBlipProcessor"),
("kosmos-2", "Kosmos2Processor"), ("kosmos-2", "Kosmos2Processor"),
("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv2", "LayoutLMv2Processor"),
......
...@@ -201,6 +201,7 @@ else: ...@@ -201,6 +201,7 @@ else:
("hubert", ("Wav2Vec2CTCTokenizer", None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)),
("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("jukebox", ("JukeboxTokenizer", None)), ("jukebox", ("JukeboxTokenizer", None)),
( (
......
...@@ -1458,18 +1458,27 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel): ...@@ -1458,18 +1458,27 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
Example: Example:
```python ```python
>>> from transformers import AutoTokenizer, IdeficsForVisionText2Text >>> from transformers import AutoProcessor, IdeficsForVisionText2Text
>>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b") >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/idefics-9b") >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
>>> prompt = "Hey, are you consciours? Can you talk to me?" >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
>>> inputs = tokenizer(prompt, return_tensors="pt") >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"
>>> # Generate >>> prompts = [
>>> generate_ids = model.generate(inputs.input_ids, max_length=30) ... [
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] ... "User:",
"Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." ... dogs_image_url_1,
... "Describe this image.\nAssistant: An image of two dogs.\n",
... "User:",
... dogs_image_url_2,
... "Describe this image.\nAssistant:",
... ]
... ]
>>> inputs = processor(prompts, return_tensors="pt")
>>> generate_ids = model.generate(**inputs, max_new_tokens=6)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)
```""" ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_idefics2": ["Idefics2Config"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_idefics2"] = ["Idefics2ImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_idefics2"] = [
"IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Idefics2ForConditionalGeneration",
"Idefics2PreTrainedModel",
"Idefics2Model",
]
_import_structure["processing_idefics2"] = ["Idefics2Processor"]
if TYPE_CHECKING:
from .configuration_idefics2 import Idefics2Config
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_idefics2 import Idefics2ImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_idefics2 import (
IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST,
Idefics2ForConditionalGeneration,
Idefics2Model,
Idefics2PreTrainedModel,
)
from .processing_idefics2 import Idefics2Processor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Idefics2 model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
class Idefics2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Idefics2VisionModel`]. It is used to instantiate a
Idefics2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics2 model
[HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
intializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation for initializing all weight matrices in the model.
Example:
```python
>>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
>>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig
>>> # Initializing a Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = Idefics2VisionConfig()
>>> # Initializing a Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = Idefics2VisionTransformer(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "idefics2"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=32,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
initializer_range=0.02,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the vision config dict if we are loading from Idefics2Config
if config_dict.get("model_type") == "idefics2":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class Idefics2PerceiverConfig(PretrainedConfig):
r"""
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the perceiver block.
resampler_n_latents (`int`, *optional*, defaults to 64):
Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
resampler_depth (`int`, *optional*, defaults to 3):
Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
resampler_n_heads (`int`, *optional*, defaults to 16):
Number of heads in each Transformer block (for multi-headed self-attention).
resampler_head_dim (`int`, *optional*, defaults to 96):
Dimensionality of each head projection in the Transformer block.
num_key_value_heads (`int`, *optional*, defaults to 4):
Number of key-value heads in the perceiver attention block.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
"""
model_type = "idefics2"
def __init__(
self,
hidden_act="silu",
resampler_n_latents=64,
resampler_depth=3,
resampler_n_heads=16,
resampler_head_dim=96,
num_key_value_heads=4,
attention_dropout=0.0,
**kwargs,
):
self.hidden_act = hidden_act
self.resampler_n_latents = resampler_n_latents
self.resampler_depth = resampler_depth
self.resampler_n_heads = resampler_n_heads
self.num_key_value_heads = num_key_value_heads
self.resampler_head_dim = resampler_head_dim
self.attention_dropout = attention_dropout
if self.num_key_value_heads > self.resampler_n_heads:
raise ValueError(
f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
f" resampler_n_heads={self.resampler_n_heads}"
)
super().__init__(**kwargs)
class Idefics2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Idefics2Model`]. It is used to instantiate a
Idefics2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the model of the Idefics2
[HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should cache the key/value pairs of the attention mechanism.
image_token_id (`int`, *optional*, defaults to 32001):
The id of the "image" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to tie the word embeddings with the token embeddings.
vision_config (`IdeficsVisionConfig` or `dict`, *optional*):
Custom vision config or dict
perceiver_config (`IdeficsPerceiverConfig` or `dict`, *optional*):
Custom perceiver config or dict
text_config (`MistralConfig` or `dict`, *optional*):
Custom text config or dict for the text model
Example:
```python
>>> from transformers import Idefics2Model, Idefics2Config
>>> # Initializing configuration
>>> configuration = Idefics2Config()
>>> # Initializing a model from the configuration
>>> model = Idefics2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "idefics2"
is_composition = True
def __init__(
self,
use_cache=True,
image_token_id=32_001,
tie_word_embeddings=False,
vision_config=None,
perceiver_config=None,
text_config=None,
**kwargs,
):
self.image_token_id = image_token_id
self.use_cache = use_cache
self.tie_word_embeddings = tie_word_embeddings
if perceiver_config is None:
self.perceiver_config = Idefics2PerceiverConfig()
logger.info("perciver_config is None, using default perceiver config")
elif isinstance(perceiver_config, dict):
self.perceiver_config = Idefics2PerceiverConfig(**perceiver_config)
elif isinstance(perceiver_config, Idefics2PerceiverConfig):
self.perceiver_config = perceiver_config
if vision_config is None:
self.vision_config = Idefics2VisionConfig()
logger.info("vision_config is None, using default vision config")
elif isinstance(vision_config, dict):
self.vision_config = Idefics2VisionConfig(**vision_config)
elif isinstance(vision_config, Idefics2VisionConfig):
self.vision_config = vision_config
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "mistral"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
logger.info("text_config is None, using default text config")
text_config = CONFIG_MAPPING["mistral"](
max_position_embeddings=4096 * 8,
rms_norm_eps=1e-5,
# None in the original configuration_mistral, we set it to the unk_token_id
pad_token_id=0,
tie_word_embeddings=False,
)
self.text_config = text_config
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import torch
from accelerate import init_empty_weights
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
Idefics2Config,
Idefics2ForConditionalGeneration,
Idefics2ImageProcessor,
Idefics2Processor,
MistralConfig,
)
EPILOG_TXT = """Example:
python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2
"""
KEYS_TO_MODIFY_MAPPING = {
"lm_head.weight": "lm_head.linear.weight",
"model.layers": "model.text_model.layers",
"model.norm": "model.text_model.norm",
"model.perceiver_resampler": "model.connector.perceiver_resampler",
"model.modality_projection": "model.connector.modality_projection",
}
WEIGHTS_TO_MERGE_MAPPING = (
# (weights to merge in merging order), (new weight name)
(
("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"),
"model.text_model.embed_tokens.weight",
),
(("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"),
)
def convert_state_dict_to_hf(state_dict):
new_state_dict = {}
for key, value in state_dict.items():
if key.endswith(".inv_freq"):
continue
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
new_state_dict[key] = value
return new_state_dict
def merge_weights(state_dict):
new_state_dict = copy.deepcopy(state_dict)
# Merge the weights
for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING:
for weight in weights_to_merge:
assert weight in state_dict, f"Weight {weight} is missing in the state dict"
if new_weight_name not in new_state_dict:
new_state_dict[new_weight_name] = [state_dict[weight]]
else:
new_state_dict[new_weight_name].append(state_dict[weight])
new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0)
# Remove the weights that were merged
for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING:
for weight in weights_to_merge:
if weight in new_state_dict and weight != new_weight_name:
new_state_dict.pop(weight)
return new_state_dict
def get_config(checkpoint):
if checkpoint == "HuggingFaceM4/idefics2":
# We load the config then recreate to use the text_config
config = AutoConfig.from_pretrained(checkpoint)
text_config = MistralConfig(
vocab_size=config.vocab_size + config.additional_vocab_size,
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
num_hidden_layers=config.num_hidden_layers,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
hidden_act=config.hidden_act,
max_position_embeddings=config.max_position_embeddings,
initializer_range=config.initializer_range,
rms_norm_eps=config.rms_norm_eps,
tie_word_embeddings=config.tie_word_embeddings,
rope_theta=config.rope_theta,
sliding_window=config.sliding_window,
attention_dropout=config.attention_dropout,
pad_token_id=config.pad_token_id,
bos_token_id=config.bos_token_id,
eos_token_id=config.eos_token_id,
)
perceiver_config = config.perceiver_config.to_dict()
config = Idefics2Config(
text_config=text_config.to_dict(),
vision_config=config.vision_config,
perceiver_config=perceiver_config,
use_cache=config.use_cache,
image_token_id=config.image_token_id,
tie_word_embeddings=config.tie_word_embeddings,
)
return config
return AutoConfig.from_pretrained(checkpoint)
def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub):
# The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration
original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True)
# The original model doesn't use the idefics2 processing objects
image_seq_len = original_model.config.perceiver_config.resampler_n_latents
image_processor = Idefics2ImageProcessor()
tokenizer = AutoTokenizer.from_pretrained(original_model_id)
processor = Idefics2Processor(
image_processor=image_processor,
tokenizer=tokenizer,
image_seq_len=image_seq_len,
)
state_dict = original_model.state_dict()
state_dict = convert_state_dict_to_hf(state_dict)
# Merge weights
state_dict = merge_weights(state_dict)
config = get_config(original_model_id)
with init_empty_weights():
model = Idefics2ForConditionalGeneration(config)
model.load_state_dict(state_dict, strict=True, assign=True)
model.save_pretrained(output_hub_path)
processor.save_pretrained(output_hub_path)
if push_to_hub:
model.push_to_hub(output_hub_path, private=True)
processor.push_to_hub(output_hub_path, private=True)
def main():
parser = argparse.ArgumentParser(
epilog=EPILOG_TXT,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--original_model_id",
help="Hub location of the text model",
)
parser.add_argument(
"--output_hub_path",
help="Location on the hub of the converted model",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="If set, the model will be pushed to the hub after conversion.",
)
args = parser.parse_args()
convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub)
if __name__ == "__main__":
main()
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
from PIL import Image
def get_resize_output_image_size(image, size, input_data_format) -> Tuple[int, int]:
"""
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image containing the keys "shortest_edge" and "longest_edge".
input_data_format (`ChannelDimension` or `str`):
The channel dimension format of the input image.
Returns:
The output size of the image after resizing.
"""
height, width = get_image_size(image, channel_dim=input_data_format)
min_len = size["shortest_edge"]
max_len = size["longest_edge"]
aspect_ratio = width / height
if width >= height and width > max_len:
width = max_len
height = int(width / aspect_ratio)
elif height > width and height > max_len:
height = max_len
width = int(height * aspect_ratio)
height = max(height, min_len)
width = max(width, min_len)
return height, width
def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
"""
Convert a single image or a list of images to a list of numpy arrays.
Args:
images (`ImageInput`):
A single image or a list of images.
Returns:
A list of numpy arrays.
"""
# If it's a single image, convert it to a list of lists
if is_valid_image(images):
images = [[images]]
# If it's a list of images, it's a single batch, so convert it to a list of lists
elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]):
images = [images]
# If it's a list of batches, it's already in the right format
elif (
isinstance(images, (list, tuple))
and len(images) > 0
and isinstance(images[0], (list, tuple))
and is_valid_image(images[0][0])
):
pass
else:
raise ValueError(
"Invalid input type. Must be a single image, a list of images, or a list of batches of images."
)
return images
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images_list: List[List[np.ndarray]], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images_list[0][0])
image_sizes = []
for images in images_list:
for image in images:
image_sizes.append(get_image_size(image, channel_dim=input_data_format))
max_height, max_width = max_across_indices(image_sizes)
return (max_height, max_width)
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
# FIXME Amy: merge this function with the one in image_transforms.py
def convert_to_rgb(image: ImageInput) -> ImageInput:
"""
Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
as is.
Args:
image (Image):
The image to convert.
"""
if not isinstance(image, PIL.Image.Image):
return image
# `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
# for transparent images. The call to `alpha_composite` handles this case
if image.mode == "RGB":
return image
image_rgba = image.convert("RGBA")
background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
alpha_composite = Image.alpha_composite(background, image_rgba)
alpha_composite = alpha_composite.convert("RGB")
return alpha_composite
class Idefics2ImageProcessor(BaseImageProcessor):
r"""
Constructs a Idefics image processor.
Args:
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA.
Only has an effect if the input image is in the PIL format.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image. The longest edge of the image is resized to be <= `size["longest_edge"]`, with the
shortest edge resized to keep the input aspect ratio, with a minimum size of `size["shortest_edge"]`.
size (`Dict`, *optional*):
Controls the size of the output image. This is a dictionary containing the keys "shortest_edge" and "longest_edge".
resample (`Resampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
rescale_factor (`float`, *optional*, defaults to `1/255`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and
a standard deviation of `image_std`.
image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Whether or not to pad the images to the largest height and width in the batch and number of images per
sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
do_image_splitting (`bool`, *optional*, defaults to `False`):
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. That
strategy was first introduced in https://arxiv.org/abs/2311.06607.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_convert_rgb: bool = True,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = True,
do_image_splitting: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.do_convert_rgb = do_convert_rgb
self.do_resize = do_resize
self.size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980}
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_pad = do_pad
self.do_image_splitting = do_image_splitting
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
resized to keep the input aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(image, size, input_data_format)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(
"size must be a dictionary with keys 'shortest_edge' and 'longest_edge' or 'height' and 'width'."
)
return resize(
image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
def pad(
self,
images: List[np.ndarray],
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> BatchFeature:
"""
For a list of images, for each images, pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width.
For each sample in the batch, pads the sample with empty images to the max_number of images per sample in the batch. Optionally returns a pixel mask.
Args:
images (`np.ndarray`):
List of list of images to pad. Pads to the largest height and width in the batch.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether to return a pixel mask.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
batch_size = len(images)
max_num_images = max(len(images_) for images_ in images)
input_data_format = (
infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format
)
data_format = input_data_format if data_format is None else data_format
def empty_image(size, input_data_format):
if input_data_format == ChannelDimension.FIRST:
return np.zeros((3, *size), dtype=np.uint8)
elif input_data_format == ChannelDimension.LAST:
return np.zeros((*size, 3), dtype=np.uint8)
raise ValueError("Invalid channel dimension format.")
padded_images_list = [
[empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size)
]
padded_masks = [[np.zeros(pad_size) for _ in range(max_num_images)] for _ in range(batch_size)]
for batch_idx in range(batch_size):
for sample_idx, image in enumerate(images[batch_idx]):
padded_images_list[batch_idx][sample_idx] = self._pad_image(
image,
pad_size,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
padded_masks[batch_idx][sample_idx] = make_pixel_mask(
image, output_size=pad_size, input_data_format=input_data_format
)
padded_masks = padded_masks if return_pixel_mask else None
return padded_images_list, padded_masks
def _crop(
self,
im: np.ndarray,
w1: int,
h1: int,
w2: int,
h2: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
if input_data_format == ChannelDimension.FIRST:
return im[:, h1:h2, w1:w2]
elif input_data_format == ChannelDimension.LAST:
return im[h1:h2, w1:w2, :]
def split_image(
self,
image: np.ndarray,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""
Split an image into 4 equal sub-images, and the concatenate that sequence with the original image.
That means that a single image becomes a sequence of 5 images.
This is a "trick" to spend more compute on each image with no changes in the vision encoder.
Args:
image (`np.ndarray`):
Images to split.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
height, width = get_image_size(image, input_data_format)
mid_width = width // 2
mid_height = height // 2
return [
self._crop(image, 0, 0, mid_width, mid_height, input_data_format),
self._crop(image, mid_width, 0, width, mid_height, input_data_format),
self._crop(image, 0, mid_height, mid_width, height, input_data_format),
self._crop(image, mid_width, mid_height, width, height, input_data_format),
image,
]
def preprocess(
self,
images: ImageInput,
do_convert_rgb: Optional[bool] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
do_image_splitting: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
input_data_format: Optional[ChannelDimension] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
):
"""
Preprocess a batch of images.
Args:
images (`ImageInput`):
A list of images to preprocess.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_pad (`bool`, *optional*, defaults to `self.do_pad`):
Whether or not to pad the images to the largest height and width in the batch.
do_image_splitting (`bool`, *optional*, defaults to `self.do_image_splitting`):
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. That
strategy was first introduced in https://arxiv.org/abs/2311.06607.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
do_pad = do_pad if do_pad is not None else self.do_pad
do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting
images_list = make_list_of_images(images)
if not valid_images(images_list[0]):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_resize=do_resize,
size=size,
resample=resample,
)
if do_convert_rgb:
images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
# All transformations expect numpy arrays.
images_list = [[to_numpy_array(image) for image in images] for images in images_list]
if is_scaled_image(images_list[0][0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images_list[0][0])
if do_image_splitting:
new_images_list = []
for images in images_list:
new_images = []
for image in images:
new_images.extend(self.split_image(image, input_data_format))
new_images_list.append(new_images)
images_list = new_images_list
if do_resize:
images_list = [
[
self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
for image in images
]
for images in images_list
]
if do_rescale:
images_list = [
[
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
for images in images_list
]
if do_normalize:
images_list = [
[
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
for images in images_list
]
pixel_attention_mask = None
if do_pad:
images_list, pixel_attention_mask = self.pad(
images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
)
if data_format is not None:
images_list = [
[
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
for images in images_list
]
data = {"pixel_values": np.array(images_list) if do_pad else images_list} # Faster tensor conversion
if pixel_attention_mask is not None:
data["pixel_attention_mask"] = np.array(pixel_attention_mask) if do_pad else pixel_attention_mask
return BatchFeature(data=data, tensor_type=return_tensors)
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics2 model."""
import inspect
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from ..auto import AutoModel
from .configuration_idefics2 import Idefics2Config, Idefics2VisionConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Idefics2Config"
IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"HuggingFaceM4/idefics2-8b",
# See all IDEFICS2 models at https://huggingface.co/models?filter=idefics2
]
@dataclass
class Idefics2BaseModelOutputWithPast(ModelOutput):
"""
Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
class Idefics2CausalLMOutputWithPast(ModelOutput):
"""
Base class for Idefics2 causal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
class Idefics2VisionEmbeddings(nn.Module):
"""
This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
resolution.
The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
which allows treating images in their native aspect ratio and without the need to resize them to the same
fixed size. In particular, we start from the original pre-trained SigLIP model
(which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
"""
def __init__(self, config: Idefics2VisionConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
self.num_patches_per_side = self.image_size // self.patch_size
self.num_patches = self.num_patches_per_side**2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
batch_size, _, max_im_h, max_im_w = pixel_values.shape
patch_embeds = self.patch_embedding(pixel_values)
embeddings = patch_embeds.flatten(2).transpose(1, 2)
max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
nb_patches_h = p_attn_mask[:, 0].sum()
nb_patches_w = p_attn_mask[0].sum()
fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
position_ids = position_ids.to(self.position_embedding.weight.device)
embeddings = embeddings + self.position_embedding(position_ids)
return embeddings
# Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics2Vision
class Idefics2VisionAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Ignore copy
self.is_causal = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
batch_size, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
k_v_seq_len = key_states.shape[-2]
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
raise ValueError(
f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
raise ValueError(
f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
class Idefics2VisionFlashAttention2(Idefics2VisionAttention):
"""
Idefics2Vision flash attention module. This module inherits from `Idefics2VisionAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dim x hidden_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (Idefics2VisionRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = self._flash_attention_forward(
query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
)
attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # There is a memcpy here, that is very bad.
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
IDEFICS_VISION_ATTENTION_CLASSES = {
"eager": Idefics2VisionAttention,
"flash_attention_2": Idefics2VisionFlashAttention2,
}
# Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics2Vision
class Idefics2VisionMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class Idefics2MLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
output_size: int,
hidden_act: str,
):
super().__init__()
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.down_proj = nn.Linear(intermediate_size, output_size, bias=False)
self.act_fn = ACT2FN[hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# Copied from transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead with Siglip->Idefics2
class Idefics2MultiheadAttentionPoolingHead(nn.Module):
"""Multihead Attention Pooling."""
def __init__(self, config: Idefics2VisionConfig):
super().__init__()
self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Ignore copy
self.mlp = Idefics2MLP(
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
output_size=config.hidden_size,
)
def forward(self, hidden_state):
batch_size = hidden_state.shape[0]
probe = self.probe.repeat(batch_size, 1, 1)
hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
residual = hidden_state
hidden_state = self.layernorm(hidden_state)
hidden_state = residual + self.mlp(hidden_state)
return hidden_state[:, 0]
class Idefics2EncoderLayer(nn.Module):
def __init__(self, config: Idefics2Config):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Idefics2VisionMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`):
Input to the layer of shape `(batch, seq_len, embed_dim)`.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics2
class Idefics2Encoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Idefics2EncoderLayer`].
Args:
config: Idefics2Config
"""
def __init__(self, config: Idefics2Config):
super().__init__()
self.config = config
self.layers = nn.ModuleList([Idefics2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
# Ignore copy
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
encoder_layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
class Idefics2VisionTransformer(nn.Module):
def __init__(self, config: Idefics2VisionConfig):
super().__init__()
embed_dim = config.hidden_size
self.config = config
self.embeddings = Idefics2VisionEmbeddings(config)
self.encoder = Idefics2Encoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings = value
def forward(
self,
pixel_values,
patch_attention_mask: Optional[torch.BoolTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
batch_size = pixel_values.size(0)
if patch_attention_mask is None:
patch_size = self.config.patch_size
patch_attention_mask = torch.ones(
(
batch_size,
pixel_values.size(2) // patch_size,
pixel_values.size(3) // patch_size,
)
)
patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
patch_attention_mask = patch_attention_mask.view(batch_size, -1)
# The call to `_upad_input` in `_flash_attention_forward` is expensive
# So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
# avoiding passing the attention_mask, which is equivalent to attending to the full sequence
if not torch.any(~patch_attention_mask):
patch_attention_mask = None
elif not self._use_flash_attention_2:
patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=patch_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
if not return_dict:
return (last_hidden_state,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=last_hidden_state,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics2
class Idefics2RMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Idefics2RMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
class Idefics2PerceiverAttention(nn.Module):
def __init__(self, config, layer_idx: Optional[int] = None) -> None:
"""Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
super().__init__()
self.layer_idx = None
self.hidden_size = config.text_config.hidden_size
self.num_heads = config.perceiver_config.resampler_n_heads
self.head_dim = config.perceiver_config.resampler_head_dim
self.num_key_value_heads = config.perceiver_config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.attention_dropout = config.perceiver_config.attention_dropout
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.is_causal = False
def forward(
self,
latents: torch.Tensor,
context: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
Args:
latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
attention_mask (`torch.Tensor`, *optional*): Tensor of shape [bsz, 1, seq, n_latents] representing attention mask.
position_ids (`torch.LongTensor`, *optional*): Tensor of shape [bsz, seq] representing position indices of each input token.
past_key_value (`Tuple[torch.Tensor]`, *optional*): Tuple of tensors containing cached key and value states.
output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_value for caching.
"""
bsz, q_len, _ = latents.size()
kv_seq_len = q_len + context.size()[1]
hidden_states = torch.concat([context, latents], dim=-2)
query_states = self.q_proj(latents)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
past_key_value = getattr(self, "past_key_value", past_key_value)
if past_key_value is not None:
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with MistralAttention->Idefics2PerceiverAttention,MistralFlashAttention->Idefics2PerceiverFlashAttention,Mistral->Idefics2
class Idefics2PerceiverFlashAttention2(Idefics2PerceiverAttention):
"""
Idefics2 flash attention module. This module inherits from `Idefics2PerceiverAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
# Ignore copy
def forward(
self,
latents: torch.Tensor,
context: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = latents.size()
kv_seq_len = q_len + context.size()[1]
# Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
# Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
query_states = self.q_proj(latents)
key_states = self.k_proj(torch.cat([context, latents], dim=-2))
value_states = self.v_proj(torch.cat([context, latents], dim=-2))
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
if past_key_value is not None:
# Activate slicing cache only if the config has a value `sliding_windows` attribute
if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window:
slicing_tokens = kv_seq_len - self.config.sliding_window
past_key = past_key_value[0]
past_value = past_key_value[1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1,"
f" head_dim`), got {past_key.shape}"
)
past_key_value = (past_key, past_value)
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
dropout_rate = 0.0 if not self.training else self.attention_dropout
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in float16 just to be sure everything works as expected.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
# Reashape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
attn_output = self._flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
use_sliding_windows=False,
)
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
def _flash_attention_forward(
self,
query_states,
key_states,
value_states,
attention_mask,
query_length,
dropout=0.0,
softmax_scale=None,
use_sliding_windows=False,
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
use_sliding_windows (`bool`, *optional*):
Whether to activate sliding window attention.
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
if not use_sliding_windows:
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
else:
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
window_size=(self.config.sliding_window, self.config.sliding_window),
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
if not use_sliding_windows:
attn_output = flash_attn_func(
query_states,
key_states,
value_states,
dropout,
softmax_scale=softmax_scale,
causal=causal,
)
else:
attn_output = flash_attn_func(
query_states,
key_states,
value_states,
dropout,
softmax_scale=softmax_scale,
causal=causal,
window_size=(self.config.sliding_window, self.config.sliding_window),
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
# On the first iteration we need to properly re-create the padding mask
# by slicing it on the proper place
if kv_seq_len != attention_mask.shape[-1]:
attention_mask_num_tokens = attention_mask.shape[-1]
attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # There is a memcpy here, that is very bad.
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
IDEFICS2_PERCEIVER_ATTENTION_CLASSES = {
"eager": Idefics2PerceiverAttention,
"flash_attention_2": Idefics2PerceiverFlashAttention2,
}
class Idefics2PerceiverLayer(nn.Module):
def __init__(self, config, layer_idx: int):
super().__init__()
self.hidden_size = config.text_config.hidden_size
self.n_latents = config.perceiver_config.resampler_n_latents
self.depth = config.perceiver_config.resampler_depth
self.rms_norm_eps = config.text_config.rms_norm_eps
self.input_latents_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
self.input_context_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
self.self_attn = IDEFICS2_PERCEIVER_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.post_attention_layernorm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
self.mlp = Idefics2MLP(
hidden_size=config.text_config.hidden_size,
intermediate_size=config.text_config.hidden_size * 4,
output_size=config.text_config.hidden_size,
hidden_act=config.perceiver_config.hidden_act,
)
def forward(
self,
latents: torch.Tensor,
context: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, sequence_length)` where padding elements are indicated by 0.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = latents
latents = self.input_latents_norm(latents)
context = self.input_context_norm(context)
latents, self_attn_weights, present_key_value = self.self_attn(
latents=latents,
context=context,
attention_mask=attention_mask,
)
latents = residual + latents
residual = latents
latents = self.post_attention_layernorm(latents)
latents = self.mlp(latents)
latents = residual + latents
outputs = (latents,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class Idefics2PerceiverResampler(nn.Module):
def __init__(self, config) -> None:
"""
Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
returns a Tensor of shape [bsz, n_latents, embed_dim]. The Resampler acts as a form of learned pooling and
is derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206).
"""
super().__init__()
self.hidden_size = config.text_config.hidden_size
self.hidden_act = config.perceiver_config.hidden_act
self.n_latents = config.perceiver_config.resampler_n_latents
self.depth = config.perceiver_config.resampler_depth
self.rms_norm_eps = config.text_config.rms_norm_eps
# Create Latents for Perceiver
self.latents = nn.Parameter(torch.ones(self.n_latents, self.hidden_size))
# Create Transformer Blocks
self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
def forward(
self,
context: torch.Tensor,
attention_mask,
) -> torch.Tensor:
# seq embed -> bsz seq embed
latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size()))
latent_attention_mask = torch.ones(
(attention_mask.size(0), latents.size(1)), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
attention_mask = (
_prepare_4d_attention_mask(attention_mask, latents.dtype, tgt_len=self.n_latents)
if not self._use_flash_attention_2
else attention_mask
)
compressed_context = latents
for perceiver_layer in self.layers:
layer_outputs = perceiver_layer(
compressed_context,
context,
attention_mask=attention_mask,
position_ids=None,
past_key_value=None,
output_attentions=False,
use_cache=False,
)
compressed_context = layer_outputs[0]
compressed_context = self.norm(compressed_context)
return compressed_context
class Idefics2Connector(nn.Module):
def __init__(self, config):
super().__init__()
self.modality_projection = Idefics2MLP(
hidden_size=config.vision_config.hidden_size,
intermediate_size=config.text_config.intermediate_size,
output_size=config.text_config.hidden_size,
hidden_act=config.text_config.hidden_act,
)
self.perceiver_resampler = Idefics2PerceiverResampler(config)
def forward(self, image_hidden_states, attention_mask):
image_hidden_states = self.modality_projection(image_hidden_states)
image_hidden_states = self.perceiver_resampler(context=image_hidden_states, attention_mask=attention_mask)
return image_hidden_states
IDEFICS2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Idefics2Config`] or [`Idefics2VisionConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Idefics2 Model outputting raw hidden-states without any specific head on top.",
IDEFICS2_START_DOCSTRING,
)
class Idefics2PreTrainedModel(PreTrainedModel):
config_class = Idefics2Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def _init_weights(self, module):
# important: this ported version of Idefics2 isn't meant for training from scratch - only
# inference and fine-tuning - so the proper init weights code has been removed - the original codebase
# https://github.com/haotian-liu/LLaVA/tree/main/idefics2 should serve for that purpose
std = (
self.config.text_config.initializer_range
if hasattr(self.config, "initializer_range")
else self.config.text_config.initializer_range
)
if hasattr(module, "class_embedding"):
module.class_embedding.data.normal_(mean=0.0, std=std)
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@classmethod
def _autoset_attn_implementation(
cls,
config,
use_flash_attention_2: bool = False,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, Dict[str, int]]] = None,
check_device_map: bool = True,
**kwargs,
):
"""
Overrides the method in `PreTrainedModel` to update the vision config with the correct attention implementation
"""
config = super()._autoset_attn_implementation(
config=config,
use_flash_attention_2=use_flash_attention_2,
torch_dtype=torch_dtype,
device_map=device_map,
check_device_map=check_device_map,
**kwargs,
)
config.vision_config._attn_implementation = config._attn_implementation
return config
IDEFICS2_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
The tensors corresponding to the input images. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
[`CLIPImageProcessor`] for processing images).
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The hidden states of the image encoder after modality projection and perceiver resampling.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"""Idefics2 model consisting of a SIGLIP vision encoder and Mistral language decoder""",
IDEFICS2_START_DOCSTRING,
)
class Idefics2Model(Idefics2PreTrainedModel):
def __init__(self, config: Idefics2Config):
super().__init__(config)
self.padding_idx = self.config.text_config.pad_token_id
self.vocab_size = self.config.text_config.vocab_size
self.vision_model = Idefics2VisionTransformer(config.vision_config)
self.connector = Idefics2Connector(config)
self.text_model = AutoModel.from_config(config.text_config)
self.image_seq_len = config.perceiver_config.resampler_n_latents
self.image_token_id = self.config.image_token_id
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.post_init()
def enable_input_require_grads(self):
"""
Enables the gradients for the input embeddings.
This is useful for lora when using gradient checkpointing.
c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032
Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
"""
def get_lowest_module(module):
if len(list(module.children())) == 0:
# If the module has no children, it is a leaf module (e.g., Linear, Conv2d, etc.)
return module
else:
# Recursively call the function on each child module
return get_lowest_module(list(module.children())[0])
def make_inputs_require_grads(module, input, output):
output.requires_grad_(True)
self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
make_inputs_require_grads
)
def get_input_embeddings(self):
return self.text_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.text_model.set_input_embeddings(value)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
model_embeds = self.text_model.resize_token_embeddings(
new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of
)
self.config.text_config.vocab_size = model_embeds.num_embeddings
return model_embeds
def inputs_merger(
self,
input_ids: torch.LongTensor,
inputs_embeds: Optional[torch.Tensor],
image_hidden_states: Optional[torch.Tensor],
):
"""
This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
The merging happens as follows:
- The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
- We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space.
We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
- The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
- To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
"""
num_images, _, vision_hidden_size = image_hidden_states.shape
special_image_token_mask = input_ids == self.image_token_id
new_inputs_embeds = inputs_embeds.clone()
reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
return new_inputs_embeds
@add_start_docstrings_to_model_forward(
"""
Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
max_num_images is the maximum number of images among the batch_size samples in the batch.
Padding images are not needed beyond padding the pixel_values at the entrance of the model.
For efficiency, we only pass through the vision_model's forward the real images by
discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
""",
IDEFICS2_INPUTS_DOCSTRING,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_attention_mask: Optional[torch.BoolTensor] = None,
image_hidden_states: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Idefics2BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
past_seen_tokens = 0
if use_cache:
if not isinstance(past_key_values, Cache):
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_seen_tokens = past_key_values.get_usable_length(seq_length)
if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")
if inputs_embeds is None:
inputs_embeds = self.text_model.get_input_embeddings()(input_ids)
# START VISUAL INPUTS INTEGRATION
if pixel_values is not None and image_hidden_states is not None:
raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
elif pixel_values is not None:
batch_size, num_images, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility
pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
# Remove padding images - padding images are full 0.
nb_values_per_image = pixel_values.shape[1:].numel()
real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
pixel_values = pixel_values[real_images_inds].contiguous()
# Handle the vision attention mask
if pixel_attention_mask is None:
pixel_attention_mask = torch.ones(
size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
dtype=torch.bool,
device=pixel_values.device,
)
else:
# Remove padding images from the mask/pP p
pixel_attention_mask = pixel_attention_mask.view(
batch_size * num_images, *pixel_attention_mask.shape[2:]
)
pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
patch_size = self.config.vision_config.patch_size
patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
# Get sequence from the vision encoder
image_hidden_states = self.vision_model(
pixel_values=pixel_values,
patch_attention_mask=patch_attention_mask,
).last_hidden_state
# Modality projection & resampling
image_hidden_states = self.connector(
image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
)
elif image_hidden_states is not None:
image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
if past_seen_tokens == 0 and inputs_embeds is not None and image_hidden_states is not None:
# When we generate, we don't want to replace the potential image_token_id that we generated by images
# that simply don't exist
inputs_embeds = self.inputs_merger(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
image_hidden_states=image_hidden_states,
)
outputs = self.text_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
return Idefics2BaseModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_hidden_states,
)
@add_start_docstrings(
"""The Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. """,
IDEFICS2_START_DOCSTRING,
)
class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = Idefics2Model(config)
self.image_token_id = self.config.image_token_id
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.vocab_size = config.text_config.vocab_size
# Initialize weights and apply final processing
self.post_init()
def enable_input_require_grads(self):
"""
Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
the model weights fixed.
"""
def make_inputs_require_grads(module, input, output):
output.requires_grad_(True)
self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
make_inputs_require_grads
)
def get_input_embeddings(self):
return self.model.text_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.model.text_model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
# model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of)
model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
if new_num_tokens is None and pad_to_multiple_of is None:
return model_embeds
# Update base model and current model config
# Ignore copy
self.config.text_config.vocab_size = model_embeds.weight.shape[0]
self.vocab_size = self.config.text_config.vocab_size
# Tie weights again if needed
self.tie_weights()
return model_embeds
def tie_weights(self):
"""
Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
"""
output_embeddings = self.get_output_embeddings()
input_embeddings = self.get_input_embeddings()
if getattr(self.config, "tie_word_embeddings", True):
output_embeddings.weight = input_embeddings.weight
@add_start_docstrings_to_model_forward(IDEFICS2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Idefics2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_attention_mask: Optional[torch.BoolTensor] = None,
image_hidden_states: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Idefics2CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from io import BytesIO
>>> from transformers import AutoProcessor, AutoModelForVision2Seq
>>> from transformers.image_utils import load_image
>>> DEVICE = "cuda:0"
>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
>>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
>>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
>>> model = AutoModelForVision2Seq.from_pretrained(
... "HuggingFaceM4/idefics2-8b-base",
>>> ).to(DEVICE)
>>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
>>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
>>> # Create inputs
>>> prompts = [
... "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
... "In which city is that bridge located?<image>",
>>> ]
>>> images = [[image1, image2], [image3]]
>>> inputs = processor(text=prompts, padding=True, return_tensors="pt").to(DEVICE)
>>> # Generate
>>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=500)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
>>> print(generated_texts)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
pixel_values=pixel_values,
pixel_attention_mask=pixel_attention_mask,
image_hidden_states=image_hidden_states,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
logits = logits.float()
loss = None
if labels is not None:
labels = labels.to(logits.device)
# Shift so that tokens < n predict n
if attention_mask is not None:
shift_attention_mask = attention_mask[..., 1:].to(logits.device)
shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
else:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=self.image_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Idefics2CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=outputs.image_hidden_states,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
# Omit tokens covered by past_key_values
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
image_hidden_states = kwargs.get("image_hidden_states", None)
if image_hidden_states is not None:
pixel_values = None
pixel_attention_mask = None
else:
pixel_values = kwargs.get("pixel_values", None)
pixel_attention_mask = kwargs.get("pixel_attention_mask", None)
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"pixel_attention_mask": pixel_attention_mask,
"image_hidden_states": image_hidden_states,
}
)
return model_inputs
def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
model_kwargs = super()._update_model_kwargs_for_generation(
outputs=outputs,
model_kwargs=model_kwargs,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
# Get the precomputed image_hidden_states
model_kwargs["image_hidden_states"] = outputs.image_hidden_states
return model_kwargs
@staticmethod
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for IDEFICS2.
"""
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, load_image
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
from ...utils import TensorType, logging
if TYPE_CHECKING:
from ...pipelines.conversational import Conversation
from ...tokenization_utils_base import PreTokenizedInput
logger = logging.get_logger(__name__)
def is_url(val) -> bool:
return isinstance(val, str) and val.startswith("http")
def is_image_or_image_url(elem):
return is_url(elem) or is_valid_image(elem)
class Idefics2Processor(ProcessorMixin):
r"""
Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor.
[`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.
Args:
image_processor (`Idefics2ImageProcessor`):
An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
tokenizer (`PreTrainedTokenizerBase`, *optional*):
An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
image_seq_len (`int`, *optional*, defaults to 64):
The length of the image sequence i.e. the number of <image> tokens per image in the input.
This parameter is used to build the string from the input prompt and image tokens and should match the
config.perceiver_config.resampler_n_latents value for the model used.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "Idefics2ImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, **kwargs):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
self.image_token = AddedToken("<image>", normalized=False, special=True)
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
self.image_seq_len = image_seq_len
tokens_to_add = {
"additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token]
}
tokenizer.add_special_tokens(tokens_to_add)
# Stores a Jinja template that formats chat histories into tokenizable strings
self.chat_template = kwargs.pop("chat_template", None)
super().__init__(image_processor, tokenizer)
def _extract_images_from_prompts(self, prompts):
prompt_images = []
for prompt in prompts:
images = []
for elem in prompt:
if is_valid_image(elem):
images.append(elem)
elif is_url(elem):
images.append(load_image(elem))
prompt_images.append(images)
return prompt_images
def __call__(
self,
text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None,
images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None,
image_seq_len: Optional[int] = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
is_split_into_words: bool = False,
add_special_tokens: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchEncoding:
"""
Processes the input prompts and returns a BatchEncoding.
Example:
```python
>>> import requests
>>> from transformers import Idefics2Processor
>>> from transformers.image_utils import load_image
>>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
>>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example
>>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
>>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
>>> image1, image2 = load_image(url1), load_image(url2)
>>> images = [[image1], [image2]]
>>> text = [
... "<image>In this image, we see",
... "bla bla bla<image>",
... ]
>>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True)
>>> input_ids = outputs.input_ids
>>> input_tokens = processor.tokenizer.batch_decode(input_ids)
>>> print(input_tokens)
['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
```
Args:
text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
Wherever an image token, `<image>` is encountered it is expanded to
`<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
image_seq_len (`int`, *optional*):
The length of the image sequence. If not provided, the default value is used.
padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information.
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding/truncation length. See
[`PreTrainedTokenizerFast.__call__`] for more information.
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
tokenization process and assume the input is already tokenized.
add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
return_tensors (`Union[str, TensorType]`, *optional*):
If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
information.
"""
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
n_images_in_text = []
inputs = BatchFeature()
if text is not None:
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
fake_image_token = self.fake_image_token.content
image_token = self.image_token.content
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
if self.image_processor.do_image_splitting:
# A single image token is split into 4 patches + 1 original image
image_str = image_str * 5
prompt_strings = []
for sample in text:
n_images_in_text.append(sample.count(image_token))
sample = sample.replace(image_token, image_str)
# Remove any double fake tokens if images are adjacent
sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
prompt_strings.append(sample)
text_inputs = self.tokenizer(
text=prompt_strings,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
is_split_into_words=is_split_into_words,
return_tensors=return_tensors,
)
inputs.update(text_inputs)
if images is not None:
if is_image_or_image_url(images):
images = [[images]]
elif isinstance(images, list) and is_image_or_image_url(images[0]):
images = [images]
elif (
not isinstance(images, list)
and not isinstance(images[0], list)
and not is_image_or_image_url(images[0][0])
):
raise ValueError(
"Invalid input images. Please provide a single image or a list of images or a list of list of images."
)
n_images_in_images = [len(sample) for sample in images]
if text is not None and not n_images_in_images == n_images_in_text:
raise ValueError(
f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
)
# Load images if they are URLs
images = [[load_image(im) for im in sample] for sample in images]
image_inputs = self.image_processor(images, return_tensors=return_tensors)
inputs.update(image_inputs)
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
def apply_chat_template(
self,
conversation: Union[List[Dict[str, str]], "Conversation"],
chat_template: Optional[str] = None,
tokenize: bool = False,
**kwargs,
) -> str:
"""
Overrides the tokenizer's `apply_chat_template` method to apply the IDEFICS2 chat template by default
if no chat template is provided.
By default, the output isn't tokenized. This is because the IDEFICS2 chat template is designed to insert
the image token <image> into the sequence according to the message, but does not handle expanding the image
tokens to the sequence length or adding the surrounding tokens e.g. <fake_image_token>.
Args:
conversation (`Union[List[Dict, str, str], "Conversation"]`):
The conversation to format.
chat_template (`Optional[str]`, *optional*):
The Jinja template to use for formatting the conversation. If not provided, the default chat template
is used.
tokenize (`bool`, *optional*, defaults to `False`):
Whether to tokenize the output or not.
**kwargs:
Additional keyword arguments for the tokenizer's `apply_chat_template` method.
"""
if chat_template is None:
if self.chat_template is not None:
chat_template = self.chat_template
else:
chat_template = self.default_chat_template
return self.tokenizer.apply_chat_template(
conversation, chat_template=chat_template, tokenize=tokenize, **kwargs
)
@property
def default_chat_template(self):
"""
This template formats inputs in the form of a chat history. For each message in the chat history:
* the template will output the role of the speaker followed by the content of the message.
* content can be a single string or a list of strings and images.
* If the content element is an image, the template will output a sequence of <image> tokens and <fake_token_around_image> token before and after each image
* The template will output an <end_of_utterance> token at the end of each message.
Example:
```python
messages = [{
"role": "user",
"content": [
{"type": "text", "text": "What’s in this image?"},
{"type": "image"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},]
}]
```
Will create outputs like:
```
User: What is in this Image?<image><image><end_of_utterance>
Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>
```
"""
# fmt: off
return (
"{% for message in messages %}"
"{{message['role'].capitalize()}}"
"{% if message['content'][0]['type'] == 'image' %}"
"{{':'}}"
"{% else %}"
"{{': '}}"
"{% endif %}"
"{% for line in message['content'] %}"
"{% if line['type'] == 'text' %}"
"{{line['text']}}"
"{% elif line['type'] == 'image' %}"
"{{ '<image>' }}"
"{% endif %}"
"{% endfor %}"
"<end_of_utterance>\n"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ 'Assistant:' }}"
"{% endif %}"
)
# fmt: on
...@@ -4405,6 +4405,37 @@ class IdeficsProcessor(metaclass=DummyObject): ...@@ -4405,6 +4405,37 @@ class IdeficsProcessor(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST = None
class Idefics2ForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Idefics2Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Idefics2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Idefics2Processor(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST = None IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
...@@ -261,6 +261,13 @@ class IdeficsImageProcessor(metaclass=DummyObject): ...@@ -261,6 +261,13 @@ class IdeficsImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"]) requires_backends(self, ["vision"])
class Idefics2ImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ImageGPTFeatureExtractor(metaclass=DummyObject): class ImageGPTFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"] _backends = ["vision"]
......
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
if is_vision_available():
from PIL import Image
from transformers import Idefics2ImageProcessor
if is_torch_available():
import torch
class Idefics2ImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_images=1,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_rescale=True,
rescale_factor=1 / 255,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_convert_rgb=True,
do_pad=True,
do_image_splitting=True,
):
size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_images = num_images
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_convert_rgb = do_convert_rgb
self.do_pad = do_pad
self.do_image_splitting = do_image_splitting
def prepare_image_processor_dict(self):
return {
"do_convert_rgb": self.do_convert_rgb,
"do_resize": self.do_resize,
"size": self.size,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_pad": self.do_pad,
"do_image_splitting": self.do_image_splitting,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to BridgeTowerImageProcessor,
assuming do_resize is set to True with a scalar size and size_divisor.
"""
if not batched:
shortest_edge = self.size["shortest_edge"]
longest_edge = self.size["longest_edge"]
image = image_inputs[0]
if isinstance(image, Image.Image):
w, h = image.size
else:
h, w = image.shape[1], image.shape[2]
aspect_ratio = w / h
if w > h and w >= longest_edge:
w = longest_edge
h = int(w / aspect_ratio)
elif h > w and h >= longest_edge:
h = longest_edge
w = int(h * aspect_ratio)
w = max(w, shortest_edge)
h = max(h, shortest_edge)
expected_height = h
expected_width = w
else:
expected_values = []
for images in image_inputs:
for image in images:
expected_height, expected_width = self.get_expected_values([image])
expected_values.append((expected_height, expected_width))
expected_height = max(expected_values, key=lambda item: item[0])[0]
expected_width = max(expected_values, key=lambda item: item[1])[1]
return expected_height, expected_width
def expected_output_image_shape(self, images):
height, width = self.get_expected_values(images, batched=True)
effective_nb_images = self.num_images * 5 if self.do_image_splitting else 1
return effective_nb_images, self.num_channels, height, width
def prepare_image_inputs(
self,
batch_size=None,
min_resolution=None,
max_resolution=None,
num_channels=None,
num_images=None,
size_divisor=None,
equal_resolution=False,
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
min_resolution = min_resolution if min_resolution is not None else self.min_resolution
max_resolution = max_resolution if max_resolution is not None else self.max_resolution
num_channels = num_channels if num_channels is not None else self.num_channels
num_images = num_images if num_images is not None else self.num_images
images_list = []
for i in range(batch_size):
images = []
for j in range(num_images):
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
return images_list
@require_torch
@require_vision
class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = Idefics2ImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Idefics2 model."""
import copy
import gc
import unittest
from io import BytesIO
import requests
from transformers import (
AutoProcessor,
Idefics2Config,
Idefics2ForConditionalGeneration,
Idefics2Model,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
else:
is_torch_greater_or_equal_than_2_0 = False
if is_vision_available():
from PIL import Image
class Idefics2VisionText2TextModelTester:
def __init__(
self,
parent,
is_training=True,
batch_size=2,
num_images=2,
seq_length=10,
vision_config={
"image_size": 12,
"patch_size": 12,
"num_channels": 3,
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 32,
"dropout": 0.1,
"attention_dropout": 0.1,
"initializer_range": 0.02,
},
perceiver_config={
"hidden_act": "silu",
"resampler_n_latents": 2,
"resampler_depth": 2,
"resampler_n_heads": 2,
"num_key_value_heads": 1,
"resampler_head_dim": 12,
"attention_dropout": 0.0,
},
text_config={
"vocab_size": 100,
"hidden_size": 64,
"intermediate_size": 56,
"num_hidden_layers": 3,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"hidden_act": "silu",
"max_position_embeddings": 256,
"initializer_range": 0.02,
"rms_norm_eps": 1e-6,
"pad_token_id": 0, # None in the original configuration_mistral, we set it to the unk_token_id
"bos_token_id": 1,
"eos_token_id": 2,
"image_token_id": 32_001,
"tie_word_embeddings": False,
"rope_theta": 10000.0,
"sliding_window": 32,
"attention_dropout": 0.0,
},
use_cache=False,
tie_word_embeddings=False,
image_token_id=99,
):
self.parent = parent
self.is_training = is_training
self.batch_size = batch_size
self.num_images = num_images
self.num_channels = 3
self.seq_length = seq_length
self.use_cache = use_cache
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
# Hack - add properties here so use common tests
self.vocab_size = text_config["vocab_size"]
self.num_hidden_layers = text_config["num_hidden_layers"]
self.num_attention_heads = text_config["num_attention_heads"]
self.hidden_size = text_config["hidden_size"]
self.vision_config = vision_config
self.perceiver_config = perceiver_config
self.text_config = text_config
def get_config(self):
return Idefics2Config(
use_cache=self.use_cache,
image_token_id=self.image_token_id,
tie_word_embeddings=self.tie_word_embeddings,
vision_config=self.vision_config,
perceiver_config=self.perceiver_config,
text_config=self.text_config,
vocab_size=self.vocab_size,
)
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[
self.batch_size,
self.num_images,
self.vision_config["num_channels"],
self.vision_config["image_size"],
self.vision_config["image_size"],
]
)
config = self.get_config()
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
# For simplicity just set the last n tokens to the image token
n_image_tokens_per_batch = self.num_images * self.perceiver_config["resampler_n_latents"]
input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
attention_mask = input_ids.ne(1).to(torch_device)
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `Idefics2`.
"""
all_model_classes = (Idefics2Model,) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
def setUp(self):
self.model_tester = Idefics2VisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False)
@unittest.skip("input_embeds cannot be passed in without input_ids")
def test_inputs_embeds():
pass
@unittest.skip("Model does not support padding right")
def test_flash_attn_2_generate_padding_right(self):
pass
@unittest.skip("Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
# We need to override as we need to prepare such that the image token is the last token
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
if self.model_tester.is_training is False:
model.eval()
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone theme
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Ignore copy
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override as we need to prepare such that the image token is the last token
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
# if no output embeddings -> leave test
if model.get_output_embeddings() is None:
continue
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase):
"""
Model tester for `Idefics2ForConditionalGeneration`.
"""
all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
test_torchscript = False
def setUp(self):
self.model_tester = Idefics2VisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False)
@unittest.skip("input_embeds cannot be passed in without input_ids")
def test_inputs_embeds():
pass
@unittest.skip("Model does not support padding right")
def test_flash_attn_2_generate_padding_right(self):
pass
@unittest.skip("Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
# We need to override as we need to prepare such that the image token is the last token
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone theme
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override as we need to prepare such that the image token is the last token
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
self.image1 = Image.open(
BytesIO(
requests.get(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
).content
)
)
self.image2 = Image.open(
BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
)
self.image3 = Image.open(
BytesIO(
requests.get(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
).content
)
)
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
@slow
def test_integration_test(self):
model = Idefics2ForConditionalGeneration.from_pretrained(
"HuggingFaceM4/idefics2-8b-base",
torch_dtype=torch.bfloat16,
device_map="auto",
)
model.to(torch_device)
# Create inputs
text = "<image>In this image, we see"
images = self.image1
inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True)
inputs.to(torch_device)
generated_ids = model.generate(**inputs, max_new_tokens=10)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
# Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
self.assertEqual(generated_texts[0], expected_generated_text)
@slow
@require_bitsandbytes
def test_integration_test_4bit(self):
# Let' s make sure we test the preprocessing to replace what is used
model = Idefics2ForConditionalGeneration.from_pretrained(
"HuggingFaceM4/idefics2-8b-base", load_in_4bit=True, device_map="auto"
)
# Create pixel inputs
text = ["<image>In this image, we see", "bla, bla <image><image>"]
images = [[self.image1], [self.image2, self.image3]]
inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=10)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
self.assertEqual(generated_texts[0], expected_generated_text)
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from io import BytesIO
import requests
from transformers import Idefics2Processor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
if is_vision_available():
from PIL import Image
@require_torch
@require_vision
class Idefics2ProcessorTest(unittest.TestCase):
def setUp(self):
self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
self.image1 = Image.open(
BytesIO(
requests.get(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
).content
)
)
self.image2 = Image.open(
BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
)
self.image3 = Image.open(
BytesIO(
requests.get(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
).content
)
)
self.bos_token = self.processor.tokenizer.bos_token
self.image_token = self.processor.image_token.content
self.fake_image_token = self.processor.fake_image_token.content
self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token)
self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token)
self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
self.image_seq_len = self.processor.image_seq_len
def test_process_interleaved_images_prompts_no_image_splitting(self):
old_image_splitting = self.processor.image_processor.do_image_splitting
self.processor.image_processor.do_image_splitting = False
# Test that a single image is processed correctly
inputs = self.processor(images=self.image1)
self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
# fmt: on
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = self.processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "bla, bla"
text = [
image_str + text_str_1,
text_str_2 + image_str + image_str,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = self.processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 767, 980))
self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
# fmt: on
self.processor.image_processor.do_image_splitting = old_image_splitting
def test_process_interleaved_images_prompts_image_splitting(self):
old_image_splitting = self.processor.image_processor.do_image_splitting
self.processor.image_processor.do_image_splitting = True
# Test that a single image is processed correctly
inputs = self.processor(images=self.image1)
self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
# fmt: on
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = self.processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "bla, bla"
text = [
image_str + text_str_1,
text_str_2 + image_str + image_str,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = self.processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(inputs['pixel_values'].shape, (2, 10, 3, 767, 980))
self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
# fmt: on
self.processor.image_processor.do_image_splitting = old_image_splitting
def test_add_special_tokens_processor(self):
image_str = "<image>"
text_str = "In this image, we see"
text = text_str + image_str
n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
# fmt: off
inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
inputs = self.processor(text=text, images=self.image1)
expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
# fmt: on
def test_apply_chat_template(self):
# Message contains content which a mix of lists with images and image urls and string
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "What do these images show?"},
{"type": "image"},
{"type": "image"},
"What do these images show?",
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
}
],
},
{"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
]
processor = self.processor
# Make short sequence length to test that the fake tokens are added correctly
rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
expected_rendered = (
"User: What do these images show?<image><image><end_of_utterance>\n"
"Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<end_of_utterance>\n"
"User: And who is that?<end_of_utterance>\n"
"Assistant:"
)
self.assertEqual(rendered, expected_rendered)
...@@ -60,6 +60,7 @@ from transformers.models.auto.modeling_auto import ( ...@@ -60,6 +60,7 @@ from transformers.models.auto.modeling_auto import (
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
MODEL_MAPPING_NAMES, MODEL_MAPPING_NAMES,
) )
from transformers.testing_utils import ( from transformers.testing_utils import (
...@@ -220,6 +221,7 @@ class ModelTesterMixin: ...@@ -220,6 +221,7 @@ class ModelTesterMixin:
*get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES), *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES),
*get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES),
*get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES),
*get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES),
]: ]:
inputs_dict["labels"] = torch.zeros( inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment