Unverified Commit c770600f authored by jiqing-feng's avatar jiqing-feng Committed by GitHub
Browse files

TVP model (#25856)

* tvp model for video grounding

add tokenizer auto

fix param in TVPProcessor

add docs

clear comments and enable different torch dtype

add image processor test and model test and fix code style

* fix conflict

* fix model doc

* fix image processing tests

* fix tvp tests

* remove torch in processor

* fix grammar error

* add more details on tvp.md

* fix model arch for loss, grammar, and processor

* add docstring and do not regard TvpTransformer, TvpVisionModel as individual model

* use pad_image

* update copyright

* control first downsample stride

* reduce first only works for ResNetBottleNeckLayer

* fix param name

* fix style

* add testing

* fix style

* rm init_weight

* fix style

* add post init

* fix comments

* do not test TvpTransformer

* fix warning

* fix style

* fix example

* fix config map

* add link in config

* fix comments

* fix style

* rm useless param

* change attention

* change test

* add notes

* fix comments

* fix tvp

* import checkpointing

* fix gradient checkpointing

* Use a more accurate example in readme

* update

* fix copy

* fix style

* update readme

* delete print

* remove tvp test_forward_signature

* remove TvpTransformer

* fix test init model

* merge main and make style

* fix tests and others

* fix image processor

* fix style and model_input_names

* fix tests
parent f5c9738f
This diff is collapsed.
This diff is collapsed.
# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License=, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing=, software
# distributed under the License is distributed on an "AS IS" BASIS=,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for TVP.
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class TvpProcessor(ProcessorMixin):
r"""
Constructs an TVP processor which wraps a TVP image processor and a Bert tokenizer into a single processor.
[`TvpProcessor`] offers all the functionalities of [`TvpImageProcessor`] and [`BertTokenizerFast`]. See the
[`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information.
Args:
image_processor ([`TvpImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`BertTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "TvpImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring of
the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
`List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
channels.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
"""
max_text_length = kwargs.pop("max_text_length", None)
if text is None and videos is None:
raise ValueError("You have to specify either text or videos. Both cannot be none.")
encoding = {}
if text is not None:
textual_input = self.tokenizer.batch_encode_plus(
text,
truncation=True,
padding="max_length",
max_length=max_text_length,
pad_to_max_length=True,
return_tensors=return_tensors,
return_token_type_ids=False,
**kwargs,
)
encoding.update(textual_input)
if videos is not None:
image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
encoding.update(image_features)
return BatchEncoding(data=encoding, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def post_process_video_grounding(self, logits, video_durations):
"""
Compute the time of the video.
Args:
logits (`torch.Tensor`):
The logits output of TvpForVideoGrounding.
video_durations (`float`):
The video's duration.
Returns:
start (`float`):
The start time of the video.
end (`float`):
The end time of the video.
"""
start, end = (
round(logits.tolist()[0][0] * video_durations, 1),
round(logits.tolist()[0][1] * video_durations, 1),
)
return start, end
@property
# Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
......@@ -7829,6 +7829,30 @@ class TvltPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
TVP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TvpForVideoGrounding(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class TvpModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class TvpPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class UMT5EncoderModel(metaclass=DummyObject):
_backends = ["torch"]
......
......@@ -485,6 +485,13 @@ class TvltImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class TvpImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class VideoMAEFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
# coding=utf-8
# Copyright 2023 The Intel Team Authors, The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.image_transforms import PaddingMode
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import TvpImageProcessor
class TvpImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
do_resize: bool = True,
size: Dict[str, int] = {"longest_edge": 40},
do_center_crop: bool = False,
crop_size: Dict[str, int] = None,
do_rescale: bool = False,
rescale_factor: Union[int, float] = 1 / 255,
do_pad: bool = True,
pad_size: Dict[str, int] = {"height": 80, "width": 80},
fill: int = None,
pad_mode: PaddingMode = None,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = [0.48145466, 0.4578275, 0.40821073],
image_std: Optional[Union[float, List[float]]] = [0.26862954, 0.26130258, 0.27577711],
batch_size=2,
min_resolution=40,
max_resolution=80,
num_channels=3,
num_frames=2,
):
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_pad = do_pad
self.pad_size = pad_size
self.fill = fill
self.pad_mode = pad_mode
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.batch_size = batch_size
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.num_frames = num_frames
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
"do_rescale": self.do_rescale,
"do_center_crop": self.do_center_crop,
"do_pad": self.do_pad,
"pad_size": self.pad_size,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to TvpImageProcessor,
assuming do_resize is set to True with a scalar size.
"""
if not batched:
return (int(self.pad_size["height"]), int(self.pad_size["width"]))
else:
expected_values = []
for image in image_inputs:
expected_height, expected_width = self.get_expected_values([image])
expected_values.append((expected_height, expected_width))
expected_height = max(expected_values, key=lambda item: item[0])[0]
expected_width = max(expected_values, key=lambda item: item[1])[1]
return expected_height, expected_width
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_video_inputs(
batch_size=self.batch_size,
num_frames=self.num_frames,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = TvpImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = TvpImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "pad_size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"longest_edge": 40})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size={"longest_edge": 12})
self.assertEqual(image_processor.size, {"longest_edge": 12})
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL videos
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], Image.Image)
# Test not batched input
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_numpy_4_channels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
encoded_videos = image_processing(
video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
encoded_videos = image_processing(
video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
self.image_processor_tester.num_channels = 3
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], torch.Tensor)
# Test not batched input
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
# coding=utf-8
# Copyright 2023 The Intel Team Authors, The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch TVP model. """
import unittest
from transformers import ResNetConfig, TvpConfig
from transformers.testing_utils import require_torch, require_vision, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import TvpForVideoGrounding, TvpModel
if is_vision_available():
from PIL import Image
from transformers import TvpImageProcessor
# Copied from test.models.videomae.test_modeling_videomae.VideoMAEModelTester with VideoMAE->TVP
class TVPModelTester:
def __init__(
self,
parent,
batch_size=1,
seq_length=2,
alpha=1.0,
beta=0.1,
visual_prompter_type="framepad",
visual_prompter_apply="replace",
num_frames=2,
max_img_size=448,
visual_prompt_size=96,
vocab_size=100,
hidden_size=32,
intermediate_size=32,
num_hidden_layers=2,
num_attention_heads=4,
max_position_embeddings=30,
max_grid_col_position_embeddings=30,
max_grid_row_position_embeddings=30,
hidden_dropout_prob=0.1,
hidden_act="gelu",
layer_norm_eps=1e-12,
initializer_range=0.02,
pad_token_id=0,
type_vocab_size=2,
attention_probs_dropout_prob=0.1,
):
self.parent = parent
self.batch_size = batch_size
self.input_id_length = seq_length
self.seq_length = seq_length + 10 + 784 # include text prompt length and visual input length
self.alpha = alpha
self.beta = beta
self.visual_prompter_type = visual_prompter_type
self.visual_prompter_apply = visual_prompter_apply
self.num_frames = num_frames
self.max_img_size = max_img_size
self.visual_prompt_size = visual_prompt_size
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.pad_token_id = pad_token_id
self.type_vocab_size = type_vocab_size
self.is_training = False
self.num_channels = 3
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.input_id_length], self.vocab_size)
attention_mask = random_attention_mask([self.batch_size, self.input_id_length])
pixel_values = floats_tensor(
[self.batch_size, self.num_frames, self.num_channels, self.max_img_size, self.max_img_size]
)
config = self.get_config()
return (config, input_ids, pixel_values, attention_mask)
def get_config(self):
resnet_config = ResNetConfig(
num_channels=3,
embeddings_size=64,
hidden_sizes=[64, 128],
depths=[2, 2],
hidden_act="relu",
out_features=["stage2"],
out_indices=[2],
)
return TvpConfig(
backbone_config=resnet_config,
alpha=self.alpha,
beta=self.beta,
visual_prompter_type=self.visual_prompter_type,
visual_prompter_apply=self.visual_prompter_apply,
num_frames=self.num_frames,
max_img_size=self.max_img_size,
visual_prompt_size=self.visual_prompt_size,
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
max_grid_col_position_embeddings=self.max_grid_col_position_embeddings,
max_grid_row_position_embeddings=self.max_grid_row_position_embeddings,
layer_norm_eps=self.layer_norm_eps,
initializer_range=self.initializer_range,
pad_token_id=self.pad_token_id,
type_vocab_size=self.type_vocab_size,
)
def create_and_check_model(self, config, input_ids, pixel_values, attention_mask):
model = TvpModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, pixel_values, attention_mask)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, pixel_values, attention_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "pixel_values": pixel_values, "attention_mask": attention_mask}
return config, inputs_dict
@require_torch
class TVPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as TVP does not use, inputs_embeds.
The seq_length in TVP contain textual and visual inputs, and prompt.
"""
all_model_classes = (TvpModel, TvpForVideoGrounding) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": TvpModel, "temporal-video-grounding": TvpForVideoGrounding}
if is_torch_available()
else {}
)
def setUp(self):
self.model_tester = TVPModelTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="TVP does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="TVPModel does not have input/output embeddings")
def test_model_common_attributes(self):
pass
# override as the `logit_scale` parameter initilization is different for TVP
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
# params are randomly initialized.
self.assertAlmostEqual(
param.data.mean().item(),
0.0,
delta=1.0,
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_vision
@require_torch
class TvpModelIntegrationTests(unittest.TestCase):
@cached_property
def default_image_processor(self):
return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp") if is_vision_available() else None
def test_inference_no_head(self):
model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
input_ids = torch.tensor([[1, 2]])
attention_mask = torch.tensor([[1, 1]])
encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
with torch.no_grad():
outputs = model(**encoding)
expected_shape = torch.Size((1, 796, 128))
assert outputs.last_hidden_state.shape == expected_shape
expected_slice = torch.tensor(
[[-0.4902, -0.4121, -1.7872], [-0.2184, 2.1211, -0.9371], [0.1180, 0.5003, -0.1727]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
def test_inference_with_head(self):
model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
input_ids = torch.tensor([[1, 2]])
attention_mask = torch.tensor([[1, 1]])
encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
with torch.no_grad():
outputs = model(**encoding)
expected_shape = torch.Size((1, 2))
assert outputs.logits.shape == expected_shape
expected_slice = torch.tensor([[0.5061, 0.4988]]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits, expected_slice, atol=1e-4))
......@@ -114,7 +114,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model.
"BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model.
"BarkCausalModel", # Building part of bigger (tested) model.
"BarkModel", # Does not have a forward signature - generation tested with integration tests
"BarkModel", # Does not have a forward signature - generation tested with integration tests.
"SeamlessM4TTextToUnitModel", # Building part of bigger (tested) model.
"SeamlessM4TCodeHifiGan", # Building part of bigger (tested) model.
"SeamlessM4TTextToUnitForConditionalGeneration", # Building part of bigger (tested) model.
......@@ -293,6 +293,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"SeamlessM4TTextToUnitForConditionalGeneration",
"SeamlessM4TCodeHifiGan",
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
"TvpForVideoGrounding",
]
# DO NOT edit this list!
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment