Unverified Commit 8f2cc1c3 authored by Yih-Dar, committed by GitHub

Add TFCLIPModel (#13967)



* Start the work for TFCLIPModel

* Convert to TF code (TODO: loss + doc)

* Clean up

* Fix pooled_output for TFCLIPTextTransformer - using tf.gather_nd

* assert -> raise error

* Expose TFCLIPModel

* Deal with dummy_inputs

* Add tests

* Fix all tests. TODO: manual check weight loading + add more comments

* Fix pt tf equivalence test

* fixes

* update TFCLIPVisionEmbeddings's Conv2D

* Fix loss + overwrite test_pt_tf_model_equivalence from common

* Add a comment about the change about MainLayer in test_keras_save_load

* Set return_loss=True in TFCLIPModelTester + make tests pass

* overwrite test_pt_tf_model_equivalence from tf common

* fix base_model_prefix

* Fix examples

* remove unused

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply review suggestions

* change self.pre_layrnorm to self.pre_layernorm

* apply more review suggestions

* return attention probs before dropout (to align with PT)

* fix weight init

* fix

* build doc

* fix missing doc

* fix for test
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 2d30443c
...@@ -202,7 +202,7 @@ Flax), PyTorch, and/or TensorFlow.
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| Canine | ✅ | ❌ | ✅ | ❌ | ❌ |
| CLIP | ✅ | ✅ | ✅ | ❌ | ✅ |
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
......
...@@ -125,6 +125,23 @@ This model was contributed by [valhalla](https://huggingface.co/valhalla). The o
[[autodoc]] CLIPVisionModel
- forward
## TFCLIPModel
[[autodoc]] TFCLIPModel
- call
- get_text_features
- get_image_features
## TFCLIPTextModel
[[autodoc]] TFCLIPTextModel
- call
## TFCLIPVisionModel
[[autodoc]] TFCLIPVisionModel
- call
## FlaxCLIPModel
[[autodoc]] FlaxCLIPModel
......
...@@ -1549,6 +1549,15 @@ if is_tf_available():
"TFCamembertModel",
]
)
_import_structure["models.clip"].extend(
[
"TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCLIPModel",
"TFCLIPPreTrainedModel",
"TFCLIPTextModel",
"TFCLIPVisionModel",
]
)
_import_structure["models.convbert"].extend( _import_structure["models.convbert"].extend(
[ [
"TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
...@@ -3394,6 +3403,13 @@ if TYPE_CHECKING:
TFCamembertForTokenClassification,
TFCamembertModel,
)
from .models.clip import (
TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCLIPModel,
TFCLIPPreTrainedModel,
TFCLIPTextModel,
TFCLIPVisionModel,
)
from .models.convbert import (
TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFConvBertForMaskedLM,
......
...@@ -63,6 +63,12 @@ def gelu_fast(x):
return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
def quick_gelu(x):
x = tf.convert_to_tensor(x)
coeff = tf.cast(1.702, x.dtype)
return x * tf.math.sigmoid(coeff * x)
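For context, `quick_gelu` is the sigmoid-based GELU approximation used by the original CLIP weights. A minimal sketch (not part of the diff) comparing it against the exact GELU on a few values; the gap stays around 2e-2 on this range:

```python
import tensorflow as tf

x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
quick = x * tf.math.sigmoid(1.702 * x)                   # quick_gelu(x)
exact = 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))  # exact GELU
print(float(tf.reduce_max(tf.abs(quick - exact))))       # roughly 2e-2
```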
if version.parse(tf.version.VERSION) >= version.parse("2.4"):
def approximate_gelu_wrap(x):
...@@ -84,6 +90,7 @@ ACT2FN = {
"mish": mish,
"tanh": tf.keras.activations.tanh,
"gelu_fast": gelu_fast,
"quick_gelu": quick_gelu,
}
......
...@@ -29,6 +29,7 @@ logger = logging.get_logger(__name__)
TF_MODEL_MAPPING_NAMES = OrderedDict(
[
# Base model mapping
("clip", "TFCLIPModel"),
("deberta-v2", "TFDebertaV2Model"), ("deberta-v2", "TFDebertaV2Model"),
("deberta", "TFDebertaModel"), ("deberta", "TFDebertaModel"),
("rembert", "TFRemBertModel"), ("rembert", "TFRemBertModel"),
......
...@@ -20,6 +20,7 @@ from typing import TYPE_CHECKING
from ...file_utils import (
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
...@@ -47,6 +48,15 @@ if is_torch_available():
"CLIPVisionModel",
]
if is_tf_available():
_import_structure["modeling_tf_clip"] = [
"TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCLIPModel",
"TFCLIPPreTrainedModel",
"TFCLIPTextModel",
"TFCLIPVisionModel",
]
if is_flax_available():
_import_structure["modeling_flax_clip"] = [
"FlaxCLIPModel",
...@@ -78,6 +88,15 @@ if TYPE_CHECKING:
CLIPVisionModel,
)
if is_tf_available():
from .modeling_tf_clip import (
TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCLIPModel,
TFCLIPPreTrainedModel,
TFCLIPTextModel,
TFCLIPVisionModel,
)
if is_flax_available():
from .modeling_flax_clip import (
FlaxCLIPModel,
......
# coding=utf-8
# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 CLIP model. """
import math
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...file_utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
# Public API
from ...modeling_tf_utils import (
DUMMY_INPUTS,
TFModelInputType,
TFPreTrainedModel,
get_initializer,
input_processing,
keras_serializable,
shape_list,
)
from ...utils import logging
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai/clip-vit-base-patch32",
# See all CLIP models at https://huggingface.co/models?filter=clip
]
LARGE_NEGATIVE = -1e8
# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
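A minimal sketch (not part of the diff) of what `_expand_mask` produces for a padded batch, assuming the helper defined above is in scope:

```python
import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0]])   # (bsz=1, seq_len=4), last position is padding
bias = _expand_mask(attention_mask)            # (1, 1, 4, 4) additive bias
print(bias[0, 0, 0].numpy())                   # [0. 0. 0. -1.e+08] -> the padded key is masked out
```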
# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean(
tf.keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(tf.transpose(similarity))
return (caption_loss + image_loss) / 2.0
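A minimal sketch (not part of the diff) of the symmetric loss on a toy similarity matrix, assuming `clip_loss` above is in scope; the targets are implicitly the diagonal, i.e. the i-th caption belongs to the i-th image:

```python
import tensorflow as tf

similarity = tf.constant([[4.0, 0.1],
                          [0.2, 3.0]])  # logits_per_text, diagonal = matching pairs
loss = clip_loss(similarity)            # mean of caption->image and image->caption cross-entropy
print(float(loss))                      # small, since the diagonal already dominates
```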
@dataclass
class TFCLIPOutput(ModelOutput):
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the
image-text similarity scores.
logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the
text-image similarity scores.
text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`TFCLIPTextModel`].
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`TFCLIPVisionModel`].
text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
The output of the [`TFCLIPTextModel`].
vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
The output of the [`TFCLIPVisionModel`].
"""
loss: Optional[tf.Tensor] = None
logits_per_image: tf.Tensor = None
logits_per_text: tf.Tensor = None
text_embeds: tf.Tensor = None
image_embeds: tf.Tensor = None
text_model_output: TFBaseModelOutputWithPooling = None
vision_model_output: TFBaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.config = config
self.patch_embedding = tf.keras.layers.Conv2D(
filters=self.embed_dim,
kernel_size=self.patch_size,
strides=self.patch_size,
padding="valid",
data_format="channels_last",
use_bias=False,
kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor),
name="patch_embedding",
)
def build(self, input_shape: tf.TensorShape):
factor = self.config.initializer_factor
self.class_embedding = self.add_weight(
shape=(self.embed_dim,),
initializer=get_initializer(self.embed_dim ** -0.5 * factor),
trainable=True,
name="class_embedding",
)
with tf.name_scope("position_embedding"):
self.position_embedding = self.add_weight(
shape=(self.num_positions, self.embed_dim),
initializer=get_initializer(self.config.initializer_range * factor),
trainable=True,
name="embeddings",
)
super().build(input_shape)
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
"""`pixel_values` is expected to be of NCHW format."""
batch_size, num_channels, height, width = shape_list(pixel_values)
# When running on CPU, `tf.nn.conv2d` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
patch_embeds = self.patch_embedding(pixel_values)
# Change the 2D spatial dimensions to a single temporal dimension.
# shape = (batch_size, num_patches, out_channels=embed_dim)
patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1))
# add the [CLS] token to the embedded patch tokens
class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim))
embeddings = tf.concat((class_embeds, patch_embeds), axis=1)
embeddings = embeddings + self.position_embedding
return embeddings
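A minimal sketch (not part of the diff) of the shape flow through this layer, assuming the default `CLIPVisionConfig` (image_size=224, patch_size=32, hidden_size=768) and assuming the class is importable from `transformers.models.clip.modeling_tf_clip` once this file is merged:

```python
import tensorflow as tf
from transformers import CLIPVisionConfig
# assumed import path after the merge
from transformers.models.clip.modeling_tf_clip import TFCLIPVisionEmbeddings

config = CLIPVisionConfig()                          # ViT-B/32-style defaults
embeddings = TFCLIPVisionEmbeddings(config)
pixel_values = tf.random.uniform((2, 3, 224, 224))   # NCHW, as the layer expects
print(embeddings(pixel_values).shape)                # (2, 50, 768): 49 patches + 1 [CLS] token
```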
class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.vocab_size = config.vocab_size
self.config = config
def build(self, input_shape: tf.TensorShape):
with tf.name_scope("token_embedding"):
self.weight = self.add_weight(
shape=(self.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="weight",
)
with tf.name_scope("position_embedding"):
self.position_embedding = self.add_weight(
shape=(self.config.max_position_embeddings, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="embeddings",
)
super().build(input_shape)
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
final_embeddings = inputs_embeds + position_embeds
return final_embeddings
class TFCLIPAttention(tf.keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = self.embed_dim // self.num_attention_heads
if self.attention_head_size * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_attention_heads})."
)
factor = config.initializer_factor
in_proj_std = (self.embed_dim ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (self.embed_dim ** -0.5) * factor
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.q_proj = tf.keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
)
self.k_proj = tf.keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
)
self.v_proj = tf.keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout)
self.out_proj = tf.keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
)
# copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""Input shape: Batch x Time x Channel"""
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.q_proj(inputs=hidden_states)
mixed_key_layer = self.k_proj(inputs=hidden_states)
mixed_value_layer = self.v_proj(inputs=hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
# (batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
# apply the causal_attention_mask first
if causal_attention_mask is not None:
# Apply the causal attention mask (precomputed for all layers in TFCLIPModel call() function)
attention_scores = tf.add(attention_scores, causal_attention_mask)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in TFCLIPModel call() function)
attention_scores = tf.add(attention_scores, attention_mask)
# Normalize the attention scores to probabilities.
_attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(inputs=_attention_probs, training=training)
attention_output = tf.matmul(attention_probs, value_layer)
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
# (batch_size, seq_len_q, embed_dim)
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
attention_output = self.out_proj(attention_output, training=training)
# In TFBert, attention weights are returned after dropout.
# However, in CLIP, they are returned before dropout.
outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
return outputs
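A minimal sketch (not part of the diff) of the head-splitting layout used by `transpose_for_scores`, on toy dimensions:

```python
import tensorflow as tf

batch_size, seq_len, num_heads, head_size = 2, 5, 8, 64
hidden = tf.random.normal((batch_size, seq_len, num_heads * head_size))
# (bsz, seq_len, all_head_size) -> (bsz, num_heads, seq_len, head_size)
scores_layout = tf.transpose(
    tf.reshape(hidden, (batch_size, -1, num_heads, head_size)), perm=[0, 2, 1, 3]
)
print(scores_layout.shape)  # (2, 8, 5, 64)
```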
class TFCLIPMLP(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
self.activation_fn = get_tf_activation(config.hidden_act)
factor = config.initializer_factor
in_proj_std = (config.hidden_size ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * config.hidden_size) ** -0.5 * factor
self.fc1 = tf.keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
)
self.fc2 = tf.keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
class TFCLIPEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.self_attn = TFCLIPAttention(config, name="self_attn")
self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFCLIPMLP(config, name="mlp")
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
causal_attention_mask (`tf.Tensor`): causal attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`):
Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(inputs=hidden_states)
attention_outputs = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
training=training,
)
hidden_states = attention_outputs[0]
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(inputs=hidden_states)
hidden_states = self.mlp(hidden_states=hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them
return outputs
class TFCLIPEncoder(tf.keras.layers.Layer):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`TFCLIPEncoderLayer`].
Args:
config: CLIPConfig
"""
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layers):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class TFCLIPTextTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
super().__init__(**kwargs)
self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
def call(
self,
input_ids: TFModelInputType,
attention_mask: tf.Tensor,
position_ids: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
input_shape = shape_list(input_ids)
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
batch_size, seq_length = input_shape
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
# check attention mask and invert
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask)
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
sequence_output = self.final_layer_norm(inputs=sequence_output)
# text_embeds.shape = [batch_size, n_ctx, transformer.width]
# take features from the eot embedding (eot_token is the highest number in each sequence)
pooled_output = tf.gather_nd(
params=sequence_output,
indices=tf.stack(
values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
),
)
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
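A minimal sketch (not part of the diff) of the `tf.gather_nd` pooling above, which picks the hidden state at the position of the highest token id (the EOT token) in each sequence:

```python
import tensorflow as tf

sequence_output = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), (2, 3, 4))  # (bsz, seq, dim)
input_ids = tf.constant([[5, 9, 0],
                         [9, 2, 1]])  # EOT (highest id) at positions 1 and 0
indices = tf.stack(
    values=(tf.range(2, dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
)
pooled_output = tf.gather_nd(params=sequence_output, indices=indices)  # rows (0, 1) and (1, 0), shape (2, 4)
```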
def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
diag = tf.constant(0.0, shape=(seq_length,), dtype=dtype)
# set an additive 2D attention mask with all places being masked
to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length), dtype=dtype)
# set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
# TIP: think of the 2D matrix as the space of (query_seq, key_seq)
to_mask = tf.linalg.band_part(to_mask, 0, -1)
# to_mask = tf.linalg.band_part(to_mask, -1, 0)
to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
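A minimal sketch (not part of the diff) of what `_build_causal_attention_mask` returns for `seq_length=3` (one batch element, one head): future key positions get -10000, while the diagonal and the lower triangle stay 0:

```python
import tensorflow as tf

seq_length = 3
diag = tf.constant(0.0, shape=(seq_length,))
to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length))
to_mask = tf.linalg.band_part(to_mask, 0, -1)          # keep the upper triangle (incl. diagonal)
to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)   # then unmask the diagonal
print(to_mask.numpy())
# [[     0. -10000. -10000.]
#  [     0.      0. -10000.]
#  [     0.      0.      0.]]
```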
@keras_serializable
class TFCLIPTextMainLayer(tf.keras.layers.Layer):
config_class = CLIPTextConfig
def __init__(self, config: CLIPTextConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.text_model = TFCLIPTextTransformer(config, name="text_model")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
return self.text_model.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.text_model.embeddings.weight = value
self.text_model.embeddings.vocab_size = shape_list(value)[0]
def call(
self,
input_ids: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if inputs["input_ids"] is None:
raise ValueError("You have to specify either input_ids")
input_shape = shape_list(inputs["input_ids"])
if inputs["attention_mask"] is None:
inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
text_model_outputs = self.text_model(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
return text_model_outputs
class TFCLIPVisionTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
super().__init__(**kwargs)
self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
def call(
self,
pixel_values: TFModelInputType,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
embedding_output = self.embeddings(pixel_values=pixel_values)
embedding_output = self.pre_layernorm(inputs=embedding_output)
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=None,
causal_attention_mask=None,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
pooled_output = sequence_output[:, 0, :]
pooled_output = self.post_layernorm(inputs=pooled_output)
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@keras_serializable
class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
config_class = CLIPVisionConfig
def __init__(self, config: CLIPVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
return self.vision_model.embeddings
def call(
self,
pixel_values: Optional[TFModelInputType] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if "input_ids" in inputs:
inputs["pixel_values"] = inputs.pop("input_ids")
if inputs["pixel_values"] is None:
raise ValueError("You have to specify pixel_values")
vision_model_outputs = self.vision_model(
pixel_values=inputs["pixel_values"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
return vision_model_outputs
@keras_serializable
class TFCLIPMainLayer(tf.keras.layers.Layer):
config_class = CLIPConfig
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
if not isinstance(config.text_config, CLIPTextConfig):
raise ValueError(
f"config.text_config is expected to be of type CLIPTextConfig but is of type {type(config.text_config)}."
)
if not isinstance(config.vision_config, CLIPVisionConfig):
raise ValueError(
f"config.vision_config is expected to be of type CLIPVisionConfig but is of type {type(config.vision_config)}."
)
self.config = config
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")
self.visual_projection = tf.keras.layers.Dense(
units=self.projection_dim,
kernel_initializer=get_initializer(vision_config.hidden_size ** -0.5 * self.config.initializer_factor),
use_bias=False,
name="visual_projection",
)
self.text_projection = tf.keras.layers.Dense(
units=self.projection_dim,
kernel_initializer=get_initializer(text_config.hidden_size ** -0.5 * self.config.initializer_factor),
use_bias=False,
name="text_projection",
)
def build(self, input_shape: tf.TensorShape):
self.logit_scale = self.add_weight(
shape=(1,),
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
name="logit_scale",
)
super().build(input_shape)
def get_text_features(
self,
input_ids: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> tf.Tensor:
inputs = input_processing(
func=self.get_text_features,
config=self.config,
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if inputs["input_ids"] is None:
raise ValueError("You have to specify either input_ids")
input_shape = shape_list(inputs["input_ids"])
if inputs["attention_mask"] is None:
inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
text_outputs = self.text_model(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
pooled_output = text_outputs[1]
text_features = self.text_projection(inputs=pooled_output)
return text_features
def get_image_features(
self,
pixel_values: Optional[TFModelInputType] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> tf.Tensor:
inputs = input_processing(
func=self.get_image_features,
config=self.config,
input_ids=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if "input_ids" in inputs:
inputs["pixel_values"] = inputs.pop("input_ids")
if inputs["pixel_values"] is None:
raise ValueError("You have to specify pixel_values")
vision_outputs = self.vision_model(
pixel_values=inputs["pixel_values"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
pooled_output = vision_outputs[1] # pooled_output
image_features = self.visual_projection(inputs=pooled_output)
return image_features
def call(
self,
input_ids: Optional[TFModelInputType] = None,
pixel_values: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
return_loss=return_loss,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if inputs["input_ids"] is None:
raise ValueError("You have to specify either input_ids")
if inputs["pixel_values"] is None:
raise ValueError("You have to specify pixel_values")
input_shape = shape_list(inputs["input_ids"])
if inputs["attention_mask"] is None:
inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
vision_outputs = self.vision_model(
pixel_values=inputs["pixel_values"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
text_outputs = self.text_model(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(inputs=image_embeds)
text_embeds = text_outputs[1]
text_embeds = self.text_projection(inputs=text_embeds)
# normalized features
image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True)
text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True)
# cosine similarity as logits
logit_scale = tf.math.exp(self.logit_scale)
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
logits_per_image = tf.transpose(logits_per_text)
loss = None
if inputs["return_loss"]:
loss = clip_loss(logits_per_text)
if not inputs["return_dict"]:
output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
return (loss,) + output if loss is not None else output
return TFCLIPOutput(
loss=loss,
logits_per_image=logits_per_image,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
image_embeds=image_embeds,
text_model_output=text_outputs,
vision_model_output=vision_outputs,
)
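A minimal sketch (not part of the diff) of the similarity computation above on toy embeddings; `2.6592` is the default `logit_scale_init_value` of `CLIPConfig`:

```python
import tensorflow as tf

text_embeds = tf.math.l2_normalize(tf.random.normal((2, 512)), axis=-1)   # 2 captions
image_embeds = tf.math.l2_normalize(tf.random.normal((3, 512)), axis=-1)  # 3 images
logit_scale = tf.math.exp(tf.constant(2.6592))                            # learned temperature
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale  # (2, 3)
logits_per_image = tf.transpose(logits_per_text)                                        # (3, 2)
```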
class TFCLIPPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CLIPConfig
base_model_prefix = "clip"
CLIP_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its models (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage
and behavior.
<Tip>
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all
the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
the first positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
</Tip>
Args:
config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`BertTokenizer`]. See
[`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
CLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`BertTokenizer`]. See
[`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
[`CLIPFeatureExtractor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
class TFCLIPTextModel(TFCLIPPreTrainedModel):
config_class = CLIPTextConfig
def __init__(self, config: CLIPTextConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.clip = TFCLIPTextMainLayer(config, name="clip")
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig)
def call(
self,
input_ids: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
Returns:
Examples:
```python
>>> from transformers import CLIPTokenizer, TFCLIPTextModel
>>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```"""
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
outputs = self.clip(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
return outputs
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
return TFBaseModelOutputWithPooling(
last_hidden_state=output.last_hidden_state,
pooler_output=output.pooler_output,
hidden_states=hs,
attentions=attns,
)
class TFCLIPVisionModel(TFCLIPPreTrainedModel):
config_class = CLIPVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.clip = TFCLIPVisionMainLayer(config, name="clip")
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
"""
Dummy inputs to build the network.
Returns:
`Dict[str, tf.Tensor]`: The dummy inputs.
"""
VISION_DUMMY_INPUTS = tf.random.uniform(
shape=(len(DUMMY_INPUTS), 3, self.config.image_size, self.config.image_size), dtype=tf.float32
)
return {"pixel_values": VISION_DUMMY_INPUTS}
@tf.function(
input_signature=[
{
"pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
}
]
)
def serving(self, inputs):
"""
Method used for serving the model.
Args:
inputs (`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)
return self.serving_output(output)
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig)
def call(
self,
pixel_values: Optional[TFModelInputType] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
Returns:
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, TFCLIPVisionModel
>>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if "input_ids" in inputs:
inputs["pixel_values"] = inputs.pop("input_ids")
outputs = self.clip(
pixel_values=inputs["pixel_values"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
training=inputs["training"],
)
return outputs
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
return TFBaseModelOutputWithPooling(
last_hidden_state=output.last_hidden_state,
pooler_output=output.pooler_output,
hidden_states=hs,
attentions=attns,
)
@add_start_docstrings(CLIP_START_DOCSTRING)
class TFCLIPModel(TFCLIPPreTrainedModel):
config_class = CLIPConfig
def __init__(self, config: CLIPConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.clip = TFCLIPMainLayer(config, name="clip")
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
"""
Dummy inputs to build the network.
Returns:
`Dict[str, tf.Tensor]`: The dummy inputs.
"""
VISION_DUMMY_INPUTS = tf.random.uniform(
shape=(len(DUMMY_INPUTS), 3, self.config.vision_config.image_size, self.config.vision_config.image_size),
dtype=tf.float32,
)
return {
"input_ids": tf.constant(DUMMY_INPUTS, dtype=tf.int32),
"pixel_values": VISION_DUMMY_INPUTS,
}
@tf.function(
input_signature=[
{
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
"pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
}
]
)
def serving(self, inputs):
"""
Method used for serving the model.
Args:
inputs (`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)
return self.serving_output(output)
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def get_text_features(
self,
input_ids: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> tf.Tensor:
r"""
Returns:
text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`TFCLIPTextModel`].
Examples:
```python
>>> from transformers import CLIPTokenizer, TFCLIPModel
>>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> text_features = model.get_text_features(**inputs)
```"""
inputs = input_processing(
func=self.get_text_features,
config=self.config,
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
text_features = self.clip.get_text_features(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
)
return text_features
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[TFModelInputType] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> tf.Tensor:
r"""
Returns:
image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
applying the projection layer to the pooled output of [`TFCLIPVisionModel`].
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, TFCLIPModel
>>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> image_features = model.get_image_features(**inputs)
```"""
inputs = input_processing(
func=self.get_image_features,
config=self.config,
input_ids=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
if "input_ids" in inputs:
inputs["pixel_values"] = inputs.pop("input_ids")
image_features = self.clip.get_image_features(
pixel_values=inputs["pixel_values"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
)
return image_features
@add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig)
def call(
self,
input_ids: Optional[TFModelInputType] = None,
pixel_values: Optional[TFModelInputType] = None,
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
r"""
Returns:
Examples:
```python
>>> import tensorflow as tf
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, TFCLIPModel
>>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True)
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
```"""
inputs = input_processing(
func=self.call,
config=self.config,
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
return_loss=return_loss,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
kwargs_call=kwargs,
)
outputs = self.clip(
input_ids=inputs["input_ids"],
pixel_values=inputs["pixel_values"],
attention_mask=inputs["attention_mask"],
position_ids=inputs["position_ids"],
return_loss=inputs["return_loss"],
output_attentions=inputs["output_attentions"],
output_hidden_states=inputs["output_hidden_states"],
return_dict=inputs["return_dict"],
)
return outputs
def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
return output
...@@ -704,6 +704,57 @@ class TFCamembertModel:
requires_backends(self, ["tf"])
TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TFCLIPModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tf"])
def call(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFCLIPPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tf"])
def call(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFCLIPTextModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tf"])
def call(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFCLIPVisionModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tf"])
def call(self, *args, **kwargs):
requires_backends(self, ["tf"])
TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...
...@@ -29,6 +29,7 @@ from transformers.file_utils import is_torch_available, is_vision_available
from transformers.testing_utils import (
is_flax_available,
is_pt_flax_cross_test,
is_pt_tf_cross_test,
require_torch,
require_vision,
slow,
...@@ -581,6 +582,148 @@ class CLIPModelTest(ModelTesterMixin, unittest.TestCase):
self.assertTrue(models_equal)
# overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput
@is_pt_tf_cross_test
def test_pt_tf_model_equivalence(self):
import numpy as np
import tensorflow as tf
import transformers
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning
if not hasattr(transformers, tf_model_class_name):
# transformers does not have a TF version of this model yet
return
tf_model_class = getattr(transformers, tf_model_class_name)
config.output_hidden_states = True
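# also output the hidden states so they are included in the PT/TF output comparison below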
tf_model = tf_model_class(config)
pt_model = model_class(config)
# make sure only the TF inputs that actually exist in the function args are forwarded
tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
# remove all head masks
tf_input_keys.discard("head_mask")
tf_input_keys.discard("cross_attn_head_mask")
tf_input_keys.discard("decoder_head_mask")
pt_inputs = self._prepare_for_class(inputs_dict, model_class)
pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys}
# Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
pt_model.eval()
tf_inputs_dict = {}
for key, tensor in pt_inputs.items():
# keep booleans as-is and convert the remaining tensors to TF tensors with the appropriate dtype
if type(tensor) == bool:
tf_inputs_dict[key] = tensor
elif key == "input_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
# Check we can load pt model in tf and vice-versa with model => model functions
tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict)
pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
# need to rename encoder-decoder "inputs" for PyTorch
# if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
# pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
with torch.no_grad():
pto = pt_model(**pt_inputs)
tfo = tf_model(tf_inputs_dict, training=False)
self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
continue
tf_out = tf_output.numpy()
pt_out = pt_output.numpy()
self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
if len(tf_out.shape) > 0:
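# zero out NaN positions in both outputs so the max-diff comparison below only measures real numeric differences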
tf_nans = np.copy(np.isnan(tf_out))
pt_nans = np.copy(np.isnan(pt_out))
pt_out[tf_nans] = 0
tf_out[tf_nans] = 0
pt_out[pt_nans] = 0
tf_out[pt_nans] = 0
max_diff = np.amax(np.abs(tf_out - pt_out))
self.assertLessEqual(max_diff, 4e-2)
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
with tempfile.TemporaryDirectory() as tmpdirname:
pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
torch.save(pt_model.state_dict(), pt_checkpoint_path)
tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
tf_model.save_weights(tf_checkpoint_path)
pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
# Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
pt_model.eval()
tf_inputs_dict = {}
for key, tensor in pt_inputs.items():
# convert booleans and tensors to TF tensors with the appropriate dtype
if type(tensor) == bool:
tensor = np.array(tensor, dtype=bool)
tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
elif key == "input_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
# need to rename encoder-decoder "inputs" for PyTorch
# if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
# pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
with torch.no_grad():
pto = pt_model(**pt_inputs)
tfo = tf_model(tf_inputs_dict)
self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
continue
tf_out = tf_output.numpy()
pt_out = pt_output.numpy()
self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
if len(tf_out.shape) > 0:
tf_nans = np.copy(np.isnan(tf_out))
pt_nans = np.copy(np.isnan(pt_out))
pt_out[tf_nans] = 0
tf_out[tf_nans] = 0
pt_out[pt_nans] = 0
tf_out[pt_nans] = 0
max_diff = np.amax(np.abs(tf_out - pt_out))
self.assertLessEqual(max_diff, 4e-2)
# overwrite from common since FlaxCLIPModel returns nested output
# which is not supported in the common test
@is_pt_flax_cross_test
...
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the TensorFlow CLIP model. """
import inspect
import os
import tempfile
import unittest
from importlib import import_module
import requests
from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from transformers.file_utils import is_tf_available, is_vision_available
from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_vision, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_tf_available():
import numpy as np
import tensorflow as tf
from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings
from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import CLIPProcessor
class TFCLIPVisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=30,
patch_size=2,
num_channels=3,
is_training=True,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return CLIPVisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, pixel_values):
model = TFCLIPVisionModel(config=config)
result = model(pixel_values, training=False)
# expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
image_size = (self.image_size, self.image_size)
patch_size = (self.patch_size, self.patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_tf
class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_tf_common.py, as CLIP does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else ()
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
test_onnx = False
def setUp(self):
self.model_tester = TFCLIPVisionModelTester(self)
self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_inputs_embeds(self):
# CLIP does not use inputs_embeds
pass
def test_graph_mode_with_inputs_embeds(self):
# CLIP does not use inputs_embeds
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.call)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
# in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
image_size = (self.model_tester.image_size, self.model_tester.image_size)
patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
seq_len = num_patches + 1
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
out_len = len(outputs)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
added_hidden_states = 1
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
# CLIP has a different seq_length
image_size = (self.model_tester.image_size, self.model_tester.image_size)
patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
seq_length = num_patches + 1
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[seq_length, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = TFCLIPVisionModel.from_pretrained(model_name, from_pt=True)
self.assertIsNotNone(model)
class TFCLIPTextModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=512,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return CLIPTextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = TFCLIPTextModel(config=config)
result = model(input_ids, attention_mask=input_mask, training=False)
result = model(input_ids, training=False)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_tf
class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
test_pruning = False
test_head_masking = False
test_onnx = False
def setUp(self):
self.model_tester = TFCLIPTextModelTester(self)
self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_inputs_embeds(self):
# CLIP does not use inputs_embeds
pass
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = TFCLIPTextModel.from_pretrained(model_name, from_pt=True)
self.assertIsNotNone(model)
class TFCLIPModelTester:
def __init__(self, parent, is_training=True):
self.parent = parent
self.text_model_tester = TFCLIPTextModelTester(parent)
self.vision_model_tester = TFCLIPVisionModelTester(parent)
self.is_training = is_training
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, input_ids, attention_mask, pixel_values
def get_config(self):
return CLIPConfig.from_text_vision_configs(
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
model = TFCLIPModel(config)
result = model(input_ids, pixel_values, attention_mask, training=False)
self.parent.assertEqual(
result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
)
self.parent.assertEqual(
result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, pixel_values = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"return_loss": True,
}
return config, inputs_dict
@require_tf
class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFCLIPModel,) if is_tf_available() else ()
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
test_onnx = False
def setUp(self):
self.model_tester = TFCLIPModelTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
# hidden_states are tested in individual model tests
def test_hidden_states_output(self):
pass
# inputs_embeds are tested in individual model tests
def test_inputs_embeds(self):
pass
# CLIPModel does not have input/output embeddings
def test_model_common_attributes(self):
pass
# overwrite from common since `TFCLIPModelTester` sets `return_loss` to `True`, which makes the preparation of
# `symbolic_inputs` fail.
def test_keras_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# remove `return_loss` so the symbolic inputs can be built
if self.__class__.__name__ == "TFCLIPModelTest":
inputs_dict.pop("return_loss", None)
tf_main_layer_classes = set(
module_member
for model_class in self.all_model_classes
for module in (import_module(model_class.__module__),)
for module_member_name in dir(module)
if module_member_name.endswith("MainLayer")
# This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
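# e.g. for `TFCLIPModel` this keeps only `TFCLIPMainLayer` and skips `TFCLIPTextMainLayer` / `TFCLIPVisionMainLayer`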
for module_member in (getattr(module, module_member_name),)
if isinstance(module_member, type)
and tf.keras.layers.Layer in module_member.__bases__
and getattr(module_member, "_keras_serializable", False)
)
for main_layer_class in tf_main_layer_classes:
# T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
if "T5" in main_layer_class.__name__:
# Take the same values as in TFT5ModelTester for this shared layer
shared = TFSharedEmbeddings(99, 32, name="shared")
config.use_cache = inputs_dict.pop("use_cache", None)
main_layer = main_layer_class(config, embed_tokens=shared)
else:
main_layer = main_layer_class(config)
symbolic_inputs = {
name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
}
model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
outputs = model(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "keras_model.h5")
model.save(filepath)
if "T5" in main_layer_class.__name__:
model = tf.keras.models.load_model(
filepath,
custom_objects={
main_layer_class.__name__: main_layer_class,
"TFSharedEmbeddings": TFSharedEmbeddings,
},
)
else:
model = tf.keras.models.load_model(
filepath, custom_objects={main_layer_class.__name__: main_layer_class}
)
assert isinstance(model, tf.keras.Model)
after_outputs = model(inputs_dict)
self.assert_outputs_same(after_outputs, outputs)
# overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput
@is_pt_tf_cross_test
def test_pt_tf_model_equivalence(self):
import torch
import transformers
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning
pt_model_class = getattr(transformers, pt_model_class_name)
config.output_hidden_states = True
tf_model = model_class(config)
pt_model = pt_model_class(config)
# Check we can load pt model in tf and vice-versa with model => model functions
tf_model = transformers.load_pytorch_model_in_tf2_model(
tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
)
pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
# Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
pt_model.eval()
pt_inputs_dict = {}
for name, key in self._prepare_for_class(inputs_dict, model_class).items():
if type(key) == bool:
pt_inputs_dict[name] = key
elif name == "input_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
# need to rename encoder-decoder "inputs" for PyTorch
if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
with torch.no_grad():
pto = pt_model(**pt_inputs_dict)
tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
continue
tf_out = tf_output.numpy()
pt_out = pt_output.numpy()
self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
if len(tf_out.shape) > 0:
tf_nans = np.copy(np.isnan(tf_out))
pt_nans = np.copy(np.isnan(pt_out))
pt_out[tf_nans] = 0
tf_out[tf_nans] = 0
pt_out[pt_nans] = 0
tf_out[pt_nans] = 0
max_diff = np.amax(np.abs(tf_out - pt_out))
self.assertLessEqual(max_diff, 4e-2)
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
with tempfile.TemporaryDirectory() as tmpdirname:
pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
torch.save(pt_model.state_dict(), pt_checkpoint_path)
tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
tf_model.save_weights(tf_checkpoint_path)
pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
# Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
pt_model.eval()
pt_inputs_dict = {}
for name, key in self._prepare_for_class(inputs_dict, model_class).items():
if type(key) == bool:
key = np.array(key, dtype=bool)
pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
elif name == "input_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
# need to rename encoder-decoder "inputs" for PyTorch
if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
with torch.no_grad():
pto = pt_model(**pt_inputs_dict)
tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
continue
tf_out = tf_output.numpy()
pt_out = pt_output.numpy()
self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
if len(tf_out.shape) > 0:
tf_nans = np.copy(np.isnan(tf_out))
pt_nans = np.copy(np.isnan(pt_out))
pt_out[tf_nans] = 0
tf_out[tf_nans] = 0
pt_out[pt_nans] = 0
tf_out[pt_nans] = 0
max_diff = np.amax(np.abs(tf_out - pt_out))
self.assertLessEqual(max_diff, 4e-2)
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@require_vision
class TFCLIPModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "openai/clip-vit-base-patch32"
model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
processor = CLIPProcessor.from_pretrained(model_name)
image = prepare_img()
inputs = processor(
text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
)
outputs = model(**inputs, training=False)
# verify the logits
self.assertEqual(
outputs.logits_per_image.shape,
tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
)
self.assertEqual(
outputs.logits_per_text.shape,
tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
expected_logits = tf.constant([[24.5701, 19.3049]])
tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
...@@ -282,6 +282,8 @@ class TFModelTesterMixin:
for module in (import_module(model_class.__module__),)
for module_member_name in dir(module)
if module_member_name.endswith("MainLayer")
# This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
for module_member in (getattr(module, module_member_name),)
if isinstance(module_member, type)
and tf.keras.layers.Layer in module_member.__bases__
...@@ -458,7 +460,7 @@ class TFModelTesterMixin:
"input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
}
# TODO: A better way to handle vision models
elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification"]: elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification", "TFCLIPVisionModel"]:
inputs = tf.keras.Input(
batch_shape=(
3,
...@@ -469,6 +471,20 @@ class TFModelTesterMixin:
name="pixel_values",
dtype="float32",
)
elif model_class.__name__ in ["TFCLIPModel"]:
inputs = {
"input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
"pixel_values": tf.keras.Input(
batch_shape=(
3,
self.model_tester.vision_model_tester.num_channels,
self.model_tester.vision_model_tester.image_size,
self.model_tester.vision_model_tester.image_size,
),
name="pixel_values",
dtype="float32",
),
}
elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
else:
...@@ -1244,6 +1260,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
return output
def random_attention_mask(shape, rng=None, name=None, dtype=None):
attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
# make sure that at least one token is attended to for each batch
attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
return attn_mask
def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
"""Creates a random float32 tensor"""
if rng is None:
...
...@@ -22,7 +22,7 @@ import unittest
from transformers import ViTConfig
from transformers.file_utils import cached_property, is_tf_available, is_vision_available
from transformers.testing_utils import require_tf, require_vision, slow from transformers.testing_utils import require_tf, require_vision, slow, tooslow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
...@@ -200,7 +200,7 @@ class TFViTModelTest(TFModelTesterMixin, unittest.TestCase):
# overwrite from common since `encoder_seq_length` and `encoder_key_length` are calculated
# in a different way than in text models.
@slow @tooslow
def test_saved_model_creation_extended(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
...
...@@ -111,6 +111,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"BeitForMaskedImageModeling",
"CLIPTextModel",
"CLIPVisionModel",
"TFCLIPTextModel",
"TFCLIPVisionModel",
"FlaxCLIPTextModel", "FlaxCLIPTextModel",
"FlaxCLIPVisionModel", "FlaxCLIPVisionModel",
"FlaxWav2Vec2ForCTC", "FlaxWav2Vec2ForCTC",
......