Unverified commit 868363ab authored by NielsRogge, committed by GitHub

Add InstructBLIP (#23460)



* Squash 88 commits

* Use markdown

* Remove mdx files due to bad rebase

* Fix modeling files due to bad rebase

* Fix style

* Update comment

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 8e164c54
......@@ -18,6 +18,7 @@ Processor class for BLIP-2.
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
......@@ -49,7 +50,7 @@ class Blip2Processor(ProcessorMixin):
# Copied from transformers.models.blip.processing_blip.BlipProcessor.__call__
def __call__(
self,
images=None,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
......
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_instructblip": [
"INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"InstructBlipConfig",
"InstructBlipQFormerConfig",
"InstructBlipVisionConfig",
],
"processing_instructblip": ["InstructBlipProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_instructblip"] = [
"INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"InstructBlipQFormerModel",
"InstructBlipPreTrainedModel",
"InstructBlipForConditionalGeneration",
"InstructBlipVisionModel",
]
if TYPE_CHECKING:
from .configuration_instructblip import (
INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
InstructBlipConfig,
InstructBlipQFormerConfig,
InstructBlipVisionConfig,
)
from .processing_instructblip import InstructBlipProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_instructblip import (
INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
InstructBlipForConditionalGeneration,
InstructBlipPreTrainedModel,
InstructBlipQFormerModel,
InstructBlipVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
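For context, a minimal sketch of how this lazy import surface is expected to behave at runtime, assuming the package-level exports added elsewhere in this PR are in place (names are only resolved on first attribute access via `_LazyModule`):

```python
# Hedged sketch: config and processor classes are registered unconditionally,
# while the modeling classes are only registered when PyTorch is available.
from transformers.models.instructblip import InstructBlipConfig, InstructBlipVisionConfig

config = InstructBlipConfig()  # the submodule is imported lazily on first access
print(config.model_type)       # "instructblip"
print(InstructBlipVisionConfig().hidden_size)  # 1408
```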
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" InstructBLIP model configuration"""
import copy
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Salesforce/instruct-blip-flan-t5": "https://huggingface.co/Salesforce/instruct-blip-flan-t5/resolve/main/config.json",
}
class InstructBlipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InstructBlipVisionModel`]. It is used to
instantiate an InstructBLIP vision encoder according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the InstructBLIP
[Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 1408):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 39):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 1e-10):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries and values in the self-attention layers.
Example:
```python
>>> from transformers import InstructBlipVisionConfig, InstructBlipVisionModel
>>> # Initializing an InstructBlipVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration
>>> configuration = InstructBlipVisionConfig()
>>> # Initializing an InstructBlipVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
>>> model = InstructBlipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "instructblip_vision_model"
def __init__(
self,
hidden_size=1408,
intermediate_size=6144,
num_hidden_layers=39,
num_attention_heads=16,
image_size=224,
patch_size=14,
hidden_act="gelu",
layer_norm_eps=1e-6,
attention_dropout=0.0,
initializer_range=1e-10,
qkv_bias=True,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the vision config dict if we are loading from InstructBlipConfig
if config_dict.get("model_type") == "instructblip":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
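As a hedged illustration of the `from_pretrained` override above, which slices the nested `vision_config` out of a full InstructBLIP checkpoint, the following sketch round-trips a composite config through a temporary directory (the path is local and illustrative):

```python
import tempfile

from transformers import InstructBlipConfig, InstructBlipVisionConfig

with tempfile.TemporaryDirectory() as tmp_dir:
    # Save a composite config whose config.json has model_type == "instructblip".
    InstructBlipConfig().save_pretrained(tmp_dir)
    # from_pretrained detects the composite type and extracts the nested vision_config.
    vision_config = InstructBlipVisionConfig.from_pretrained(tmp_dir)

print(vision_config.model_type)   # "instructblip_vision_model"
print(vision_config.hidden_size)  # 1408 by default
```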
class InstructBlipQFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InstructBlipQFormerModel`]. It is used to
instantiate an InstructBLIP Querying Transformer (Q-Former) model according to the specified arguments, defining the
model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the InstructBLIP [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
Read the documentation from [`PretrainedConfig`] for more information.
Note that [`InstructBlipQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling the model.
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
cross_attention_frequency (`int`, *optional*, defaults to 2):
The frequency of adding cross-attention to the Transformer layers.
encoder_hidden_size (`int`, *optional*, defaults to 1408):
The hidden size of the hidden states for cross-attention.
Examples:
```python
>>> from transformers import InstructBlipQFormerConfig, InstructBlipQFormerModel
>>> # Initializing an InstructBLIP Salesforce/instruct-blip-flan-t5 style configuration
>>> configuration = InstructBlipQFormerConfig()
>>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
>>> model = InstructBlipQFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "instructblip_qformer"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
cross_attention_frequency=2,
encoder_hidden_size=1408,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the qformer config dict if we are loading from InstructBlipConfig
if config_dict.get("model_type") == "instructblip":
config_dict = config_dict["qformer_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class InstructBlipConfig(PretrainedConfig):
r"""
[`InstructBlipConfig`] is the configuration class to store the configuration of a
[`InstructBlipForConditionalGeneration`]. It is used to instantiate an InstructBLIP model according to the specified
arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
the defaults will yield a similar configuration to that of the InstructBLIP
[Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`InstructBlipVisionConfig`].
qformer_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`InstructBlipQFormerConfig`].
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```python
>>> from transformers import (
... InstructBlipVisionConfig,
... InstructBlipQFormerConfig,
... OPTConfig,
... InstructBlipConfig,
... InstructBlipForConditionalGeneration,
... )
>>> # Initializing an InstructBlipConfig with Salesforce/instruct-blip-flan-t5 style configuration
>>> configuration = InstructBlipConfig()
>>> # Initializing an InstructBlipForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
>>> model = InstructBlipForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize an InstructBlipConfig from an InstructBlipVisionConfig, an InstructBlipQFormerConfig and any PretrainedConfig
>>> # Initializing InstructBLIP vision, InstructBLIP Q-Former and language model configurations
>>> vision_config = InstructBlipVisionConfig()
>>> qformer_config = InstructBlipQFormerConfig()
>>> text_config = OPTConfig()
>>> config = InstructBlipConfig.from_text_vision_configs(vision_config, qformer_config, text_config)
```"""
model_type = "instructblip"
is_composition = True
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
super().__init__(**kwargs)
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. Initializing the InstructBlipVisionConfig with default values.")
if qformer_config is None:
qformer_config = {}
logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
self.vision_config = InstructBlipVisionConfig(**vision_config)
self.qformer_config = InstructBlipQFormerConfig(**qformer_config)
text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
self.tie_word_embeddings = self.text_config.tie_word_embeddings
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0
self.initializer_range = 0.02
@classmethod
def from_vision_qformer_text_configs(
cls,
vision_config: InstructBlipVisionConfig,
qformer_config: InstructBlipQFormerConfig,
text_config: PretrainedConfig,
**kwargs,
):
r"""
Instantiate a [`InstructBlipConfig`] (or a derived class) from an InstructBLIP vision model, Q-Former and
language model configurations.
Returns:
[`InstructBlipConfig`]: An instance of a configuration object
"""
return cls(
vision_config=vision_config.to_dict(),
qformer_config=qformer_config.to_dict(),
text_config=text_config.to_dict(),
**kwargs,
)
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["vision_config"] = self.vision_config.to_dict()
output["qformer_config"] = self.qformer_config.to_dict()
output["text_config"] = self.text_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
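A short, hedged sketch of what the overridden `to_dict` produces for a default configuration (the printed values follow from the defaults defined above):

```python
from transformers import InstructBlipConfig

config = InstructBlipConfig()
serialized = config.to_dict()

# The override inlines the three sub-configs and records the composite model type.
print(serialized["model_type"])                             # "instructblip"
print(serialized["num_query_tokens"])                       # 32
print(serialized["vision_config"]["hidden_size"])           # 1408
print(serialized["qformer_config"]["encoder_hidden_size"])  # 1408, tied to the vision hidden size
print(serialized["text_config"]["model_type"])              # "opt", the default language model
```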
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert InstructBLIP checkpoints from the original repository.
URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip
"""
import argparse
import requests
import torch
# pip3 install salesforce-lavis
# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch)
# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml
# same for Vicuna-13b
from lavis.models import load_model_and_preprocess
from PIL import Image
from transformers import (
AutoTokenizer,
BlipImageProcessor,
InstructBlipConfig,
InstructBlipForConditionalGeneration,
InstructBlipProcessor,
InstructBlipQFormerConfig,
InstructBlipVisionConfig,
LlamaConfig,
LlamaTokenizerFast,
T5Config,
T5TokenizerFast,
)
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
def load_demo_image():
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return image
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# fmt: off
# vision encoder
rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding"))
rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding"))
rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias"))
rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight"))
rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias"))
for i in range(config.vision_config.num_hidden_layers):
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
# QFormer
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight"))
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_v_bias(state_dict, config):
for i in range(config.vision_config.num_hidden_layers):
# read in original q and v biases
q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias")
# next, set bias in the state dict
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias
def get_blip2_config(model_name):
image_size = 364 if "coco" in model_name else 224
vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict()
# make sure the models have proper bos_token_id and eos_token_id set (important for generation)
# seems like flan-T5 models don't have bos_token_id properly set?
if "t5-xl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict()
elif "t5-xxl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict()
elif "vicuna-7b" in model_name:
text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict()
elif "vicuna-13b" in model_name:
text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict()
else:
raise ValueError("Model name not supported")
# the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1
qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict()
config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config)
return config, image_size
@torch.no_grad()
def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
"""
Copy/paste/tweak model's weights to Transformers design.
"""
qformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
if "t5" in model_name:
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left")
elif "vicuna" in model_name:
# the following was used in the original implementation:
# tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left")
# tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# tokenizer.add_special_tokens({"bos_token": "</s>"})
# tokenizer.add_special_tokens({"eos_token": "</s>"})
# tokenizer.add_special_tokens({"unk_token": "</s>"})
tokenizer = LlamaTokenizerFast.from_pretrained(
"huggyllama/llama-7b", truncation_side="left", bos_token="</s>", unk_token="</s>"
)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
config, image_size = get_blip2_config(model_name)
hf_model = InstructBlipForConditionalGeneration(config).eval()
model_name_to_original = {
"instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"),
"instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"),
"instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"),
"instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"),
}
name, type = model_name_to_original[model_name]
# load original model
print("Loading original model...")
hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu"
lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu"
original_model, vis_processors, _ = load_model_and_preprocess(
name=name, model_type=type, is_eval=True, device=lavis_device
)
original_model.eval()
print("Done!")
# update state dict keys
state_dict = original_model.state_dict()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
# some keys can be renamed efficiently
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if key.startswith("Qformer.bert"):
key = key.replace("Qformer.bert", "qformer")
if "attention.self" in key:
key = key.replace("self", "attention")
if "llm_proj" in key:
key = key.replace("llm_proj", "language_projection")
if "t5_proj" in key:
key = key.replace("t5_proj", "language_projection")
if key.startswith("llm_model"):
key = key.replace("llm_model", "language_model")
if key.startswith("t5"):
key = key.replace("t5", "language")
state_dict[key] = val
# read in qv biases
read_in_q_v_bias(state_dict, config)
# note: weights get loaded in torch.float32 by default
hf_model.load_state_dict(state_dict, strict=True)
image = load_demo_image()
prompt = "What is unusual about this image?"
# create processor
image_processor = BlipImageProcessor(
size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD
)
processor = InstructBlipProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
qformer_tokenizer=qformer_tokenizer,
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device)
# make sure processor creates exact same pixel values
original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device)
pixel_values = inputs.pixel_values
assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values)
original_model.to(lavis_device)
hf_model.to(hf_model_device)
with torch.no_grad():
if "vicuna" in model_name:
original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits
logits = hf_model(**inputs).logits
else:
original_logits = original_model(
{"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]}
).logits
label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device)
labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100)
logits = hf_model(**inputs, labels=labels).logits
print("First values of original logits:", original_logits[0, :3, :3])
print("First values of HF logits:", logits[0, :3, :3])
# assert values
assert original_logits.shape == logits.shape
atol = 1e-4 if "vicuna" in model_name else 1e-5
assert torch.allclose(original_logits.to(logits.device), logits, atol=atol)
print("Looks ok!")
print("Generating with original model...")
original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5)
# important: we need to cast the weights of the HF model to the appropriate type
print("Generating with HF model...")
outputs = hf_model.generate(
**inputs,
do_sample=False,
num_beams=5,
max_length=256,
min_length=1,
top_p=0.9,
repetition_penalty=1.5,
length_penalty=1.0,
temperature=1,
)
if "vicuna" in model_name:
# convert output id 0 to 2 (eos_token_id)
# TODO add this in the generate method?
outputs[outputs == 0] = 2
print("Original generation:", original_outputs)
output_text = processor.batch_decode(outputs, skip_special_tokens=True)
output_text = [text.strip() for text in output_text]
print("HF generation:", output_text)
if pytorch_dump_folder_path is not None:
processor.save_pretrained(pytorch_dump_folder_path)
hf_model.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
processor.push_to_hub(f"Salesforce/{model_name}")
hf_model.push_to_hub(f"Salesforce/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
choices = [
"instructblip-vicuna-7b",
"instructblip-vicuna-13b",
"instructblip-flan-t5-xl",
"instructblip-flan-t5-xxl",
]
parser.add_argument(
"--model_name",
default="instructblip-flan-t5-xl",
choices=choices,
type=str,
help="Name of the model to convert (must be one of the supported choices)",
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub after converting",
)
args = parser.parse_args()
convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
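For reference, a hedged sketch of driving the conversion from Python rather than the command line. The module path is assumed from the usual naming convention for conversion scripts and is not shown in this diff; the `salesforce-lavis` dependency noted above must be installed:

```python
# Assumption: the script lives at
# src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py
# (typical naming; adjust if the file is named differently in this PR).
from transformers.models.instructblip.convert_instructblip_original_to_pytorch import (
    convert_blip2_checkpoint,
)

convert_blip2_checkpoint(
    model_name="instructblip-flan-t5-xl",                   # one of the four supported names above
    pytorch_dump_folder_path="./instructblip-flan-t5-xl",   # illustrative output folder
    push_to_hub=False,
)
```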
This diff is collapsed.
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for InstructBLIP. Largely a copy of Blip2Processor, with the addition of a tokenizer for the Q-Former.
"""
import os
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ..auto import AutoTokenizer
class InstructBlipProcessor(ProcessorMixin):
r"""
Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
processor.
[`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
Args:
image_processor (`BlipImageProcessor`):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
qformer_tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer):
super().__init__(image_processor, tokenizer)
# add QFormer tokenizer
self.qformer_tokenizer = qformer_tokenizer
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
if images is None and text is None:
raise ValueError("You have to specify at least one of images or text.")
encoding = BatchEncoding()
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding.update(text_encoding)
qformer_text_encoding = self.qformer_tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
if images is not None:
image_encoding = self.image_processor(images, return_tensors=return_tensors)
encoding.update(image_encoding)
return encoding
# Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
# Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# overwrite to save the Q-Former tokenizer in a separate folder
def save_pretrained(self, save_directory, **kwargs):
if os.path.isfile(save_directory):
raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)
return super().save_pretrained(save_directory, **kwargs)
# overwrite to load the Q-Former tokenizer from a separate folder
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
args.append(qformer_tokenizer)
return cls(*args)
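To make the processor's behavior concrete, here is a hedged end-to-end sketch; the tiny checkpoints are the internal test repos also used in the tests further below, and the outputs shown in comments are illustrative:

```python
import tempfile

import numpy as np
from PIL import Image

from transformers import BertTokenizerFast, BlipImageProcessor, GPT2Tokenizer, InstructBlipProcessor

# Assemble a processor from small components (tiny test checkpoints).
image_processor = BlipImageProcessor()
tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert")
processor = InstructBlipProcessor(image_processor, tokenizer, qformer_tokenizer)

image = Image.fromarray(np.random.randint(0, 255, size=(30, 40, 3), dtype=np.uint8))
inputs = processor(images=image, text="What is unusual about this image?", return_tensors="np")

# On top of the usual keys, the prompt is also encoded with the Q-Former tokenizer.
print(sorted(inputs.keys()))
# ['attention_mask', 'input_ids', 'pixel_values', 'qformer_attention_mask', 'qformer_input_ids']

# save_pretrained/from_pretrained round-trip the Q-Former tokenizer through the
# dedicated "qformer_tokenizer" subfolder created by the overrides above.
with tempfile.TemporaryDirectory() as tmp_dir:
    processor.save_pretrained(tmp_dir)
    reloaded = InstructBlipProcessor.from_pretrained(tmp_dir)
    print(type(reloaded.qformer_tokenizer).__name__)  # a BERT-based fast tokenizer
```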
......@@ -3729,6 +3729,37 @@ class InformerPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class InstructBlipForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipQFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class InstructBlipVisionModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -330,9 +330,7 @@ class Blip2TextModelDecoderOnlyTester:
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
3,
)
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
input_ids[:, -1] = self.eos_token_id # Eos Token
attention_mask = input_ids.ne(self.pad_token_id)
......
This diff is collapsed.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
if is_vision_available():
from PIL import Image
from transformers import (
AutoProcessor,
BertTokenizerFast,
BlipImageProcessor,
GPT2Tokenizer,
InstructBlipProcessor,
PreTrainedTokenizerFast,
)
@require_vision
class InstructBlipProcessorTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = BlipImageProcessor()
tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert")
processor = InstructBlipProcessor(image_processor, tokenizer, qformer_tokenizer)
processor.save_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_qformer_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
return image_inputs
def test_save_load_pretrained_additional_features(self):
processor = InstructBlipProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
qformer_tokenizer=self.get_qformer_tokenizer(),
)
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = InstructBlipProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, BlipImageProcessor)
self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast)
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
qformer_tokenizer = self.get_qformer_tokenizer()
processor = InstructBlipProcessor(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
image_input = self.prepare_image_inputs()
input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
qformer_tokenizer = self.get_qformer_tokenizer()
processor = InstructBlipProcessor(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
input_str = "lower newer"
encoded_processor = processor(text=input_str)
encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
for key in encoded_tokens.keys():
self.assertListEqual(encoded_tokens[key], encoded_processor[key])
for key in encoded_tokens_qformer.keys():
self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
qformer_tokenizer = self.get_qformer_tokenizer()
processor = InstructBlipProcessor(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(
list(inputs.keys()),
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
)
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
def test_tokenizer_decode(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
qformer_tokenizer = self.get_qformer_tokenizer()
processor = InstructBlipProcessor(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
qformer_tokenizer = self.get_qformer_tokenizer()
processor = InstructBlipProcessor(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(
list(inputs.keys()),
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
)
......@@ -57,6 +57,7 @@ PRIVATE_MODELS = [
# Being in this list is an exception and should **not** be the rule.
IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
# models to ignore for not tested
"InstructBlipQFormerModel", # Building part of bigger (tested) model.
"NllbMoeDecoder",
"NllbMoeEncoder",
"LlamaDecoder", # Building part of bigger (tested) model.
......@@ -282,6 +283,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"FlavaMultimodalModel",
"GPT2DoubleHeadsModel",
"GPTSw3DoubleHeadsModel",
"InstructBlipVisionModel",
"InstructBlipQFormerModel",
"LayoutLMForQuestionAnswering",
"LukeForMaskedLM",
"LukeForEntityClassification",
......