Unverified commit f9a0008d authored by NielsRogge, committed by GitHub

Add VideoMAE (#17821)



* First draft

* Add VideoMAEForVideoClassification

* Improve conversion script

* Add VideoMAEForPreTraining

* Add VideoMAEFeatureExtractor

* Improve VideoMAEFeatureExtractor

* Improve docs

* Add first draft of model tests

* Improve VideoMAEForPreTraining

* Fix base_model_prefix

* Make model take pixel_values of shape (B, T, C, H, W)

* Add loss computation of VideoMAEForPreTraining

* Improve tests

* Improve model tests

* Make all tests pass

* Add VideoMAE to main README

* Add tests for VideoMAEFeatureExtractor

* Add integration test

* Improve conversion script

* Rename patch embedding class

* Remove VideoMAELayer from init

* Update design of patch embeddings

* Improve comments

* Improve conversion script

* Improve conversion script

* Add conversion of pretrained model

* Add loss verification of pretrained model

* Add loss verification of unnormalized targets

* Add integration test for pretraining model

* Apply suggestions from code review

* Fix bug to make feature extractor resize only shorter edge

* Address more comments

* Improve normalization of videos

* Add doc examples

* Move constants to dedicated script

* Remove scripts

* Transfer checkpoints, fix docs

* Update script

* Update image mean and std

* Fix doc tests

* Set return_tensors to NumPy by default

* Revert the previous change
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 672b6626
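For context, here is roughly how the new pieces fit together (a minimal sketch rather than part of the commit itself: it reuses the checkpoint and the sample clip from the integration tests below, and assumes the checkpoint ships the usual `id2label` mapping):

from huggingface_hub import hf_hub_download
import numpy as np
import torch
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification

# the sample clip used in the tests: a NumPy stack of 16 RGB frames
file = hf_hub_download(repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy")
video = list(np.load(file))

feature_extractor = VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

# pixel_values has shape (batch_size, num_frames, num_channels, height, width)
inputs = feature_extractor(video, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (1, 400): one score per Kinetics-400 class

print(model.config.id2label[logits.argmax(-1).item()])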
@@ -22,6 +22,7 @@
from packaging import version
from .. import __version__
from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from .doc import (
add_code_sample_docstrings,
add_end_docstrings,
......
IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5]
IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5]
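# The *_DEFAULT_* values are the classic ImageNet channel statistics; the *_STANDARD_* values simply rescale pixels to [-1, 1].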
@@ -406,6 +406,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
MODEL_FOR_VISION_2_SEQ_MAPPING = None
@@ -572,6 +575,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject):
requires_backends(self, ["torch"])
class AutoModelForVideoClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class AutoModelForVision2Seq(metaclass=DummyObject):
_backends = ["torch"]
@@ -4753,6 +4763,37 @@ class VanPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = None
class VideoMAEForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class VideoMAEForVideoClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class VideoMAEModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class VideoMAEPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
VILT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
@@ -150,6 +150,13 @@ class SegformerFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class VideoMAEFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ViltFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import VideoMAEFeatureExtractor
class VideoMAEFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_frames=10,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=18,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_frames = num_frames
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def prepare_feat_extract_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
}
@require_torch
@require_vision
class VideoMAEFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase):
feature_extraction_class = VideoMAEFeatureExtractor if is_vision_available() else None
def setUp(self):
self.feature_extract_tester = VideoMAEFeatureExtractionTester(self)
@property
def feat_extract_dict(self):
return self.feature_extract_tester.prepare_feat_extract_dict()
def test_feat_extract_properties(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
self.assertTrue(hasattr(feature_extractor, "image_mean"))
self.assertTrue(hasattr(feature_extractor, "image_std"))
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
self.assertTrue(hasattr(feature_extractor, "do_resize"))
self.assertTrue(hasattr(feature_extractor, "size"))
def test_batch_feature(self):
pass
def test_call_pil(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random PIL videos
video_inputs = prepare_video_inputs(self.feature_extract_tester, equal_resolution=False)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], Image.Image)
# Test not batched input
encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
# Test batched
encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
def test_call_numpy(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random numpy tensors
video_inputs = prepare_video_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
# Test batched
encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
def test_call_pytorch(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random PyTorch tensors
video_inputs = prepare_video_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], torch.Tensor)
# Test not batched input
encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
# Test batched
encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_frames,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
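In short, the feature extractor turns a list of frames (or a batch of such lists) into a pixel_values tensor of shape (batch_size, num_frames, num_channels, height, width), i.e. the (B, T, C, H, W) layout mentioned in the commit message. A minimal sketch mirroring the tester defaults above (size=18, ten frames; the printed shape follows from the shape checks in these tests):

import numpy as np
from transformers import VideoMAEFeatureExtractor

feature_extractor = VideoMAEFeatureExtractor(size=18, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])

# one video = a list of frames; here ten random (C, H, W) uint8 frames of arbitrary resolution
video = [np.random.randint(255, size=(3, 30, 40), dtype=np.uint8) for _ in range(10)]

encoding = feature_extractor(video, return_tensors="pt")
print(encoding.pixel_values.shape)  # expected per the shape checks above: torch.Size([1, 10, 3, 18, 18])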
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch VideoMAE model. """
import copy
import inspect
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import VideoMAEConfig
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import (
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
VideoMAEForPreTraining,
VideoMAEForVideoClassification,
VideoMAEModel,
)
from transformers.models.videomae.modeling_videomae import VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from transformers import VideoMAEFeatureExtractor
class VideoMAEModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=10,
num_channels=3,
patch_size=2,
tubelet_size=2,
num_frames=2,
is_training=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
type_sequence_label_size=10,
initializer_range=0.02,
mask_ratio=0.9,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.num_channels = num_channels
self.patch_size = patch_size
self.tubelet_size = tubelet_size
self.num_frames = num_frames
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.mask_ratio = mask_ratio
self.scope = scope
# in VideoMAE, the number of tokens equals num_frames/tubelet_size * num_patches per frame
self.num_patches_per_frame = (image_size // patch_size) ** 2
self.seq_length = (num_frames // tubelet_size) * self.num_patches_per_frame
# use this variable to define bool_masked_pos
self.num_masks = int(mask_ratio * self.seq_length)
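# with the defaults above: (10 // 2) ** 2 = 25 patches per frame, (2 // 2) * 25 = 25 tokens, int(0.9 * 25) = 22 masked tokens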
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size]
)
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return VideoMAEConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
num_frames=self.num_frames,
tubelet_size=self.tubelet_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
is_decoder=False,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, pixel_values, labels):
model = VideoMAEModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_pretraining(self, config, pixel_values, labels):
model = VideoMAEForPreTraining(config)
model.to(torch_device)
model.eval()
# important: each video needs to have the same number of masked patches
# hence we define a single mask, which we then repeat for each example in the batch
mask = torch.ones((self.num_masks,))
mask = torch.cat([mask, torch.zeros(self.seq_length - mask.size(0))])
bool_masked_pos = mask.expand(self.batch_size, -1).bool()
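# bool_masked_pos has shape (batch_size, seq_length); True marks a patch that is masked out and must be reconstructed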
result = model(pixel_values, bool_masked_pos)
# model only returns predictions for masked patches
num_masked_patches = mask.sum().item()
decoder_num_labels = 3 * self.tubelet_size * self.patch_size**2
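# with the defaults above: 3 * 2 * 2**2 = 24 pixel values are predicted for every masked patch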
self.parent.assertEqual(result.logits.shape, (self.batch_size, num_masked_patches, decoder_num_labels))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class VideoMAEModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as VideoMAE does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (
(VideoMAEModel, VideoMAEForPreTraining, VideoMAEForVideoClassification) if is_torch_available() else ()
)
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = VideoMAEModelTester(self)
self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=37)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if model_class == VideoMAEForPreTraining:
# important: each video needs to have the same number of masked patches
# hence we define a single mask, which we then repeat for each example in the batch
mask = torch.ones((self.model_tester.num_masks,))
mask = torch.cat([mask, torch.zeros(self.model_tester.seq_length - mask.size(0))])
bool_masked_pos = mask.expand(self.model_tester.batch_size, -1).bool()
inputs_dict["bool_masked_pos"] = bool_masked_pos.to(torch_device)
if return_labels:
if model_class in [
*get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING),
]:
inputs_dict["labels"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="VideoMAE does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_pretraining(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = VideoMAEModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_attention_outputs(self):
if not self.has_attentions:
pass
else:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes:
num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks
seq_len = (
num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length
)
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(out_len + 1, len(outputs))
self_attentions = outputs.attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = self.model_tester.num_hidden_layers + 1
self.assertEqual(len(hidden_states), expected_num_layers)
num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks
seq_length = num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length
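# during pretraining the encoder only receives the visible (unmasked) tokens, so its hidden states are shorter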
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[seq_length, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
# We will verify our results on a video of eating spaghetti
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
def prepare_video():
file = hf_hub_download(repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy")
video = np.load(file)
return list(video)
@require_torch
@require_vision
class VideoMAEModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
# logits were tested with a different mean and std, so we use the same here
return (
VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
if is_vision_available()
else None
)
@slow
def test_inference_for_video_classification(self):
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics").to(
torch_device
)
feature_extractor = self.default_feature_extractor
video = prepare_video()
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 400))
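# 400 logits, one per Kinetics-400 class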
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([0.3669, -0.0688, -0.2421]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
@slow
def test_inference_for_pretraining(self):
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device)
feature_extractor = self.default_feature_extractor
video = prepare_video()
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
# add boolean mask, indicating which patches to mask
local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
inputs["bool_masked_pos"] = torch.load(local_path)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size([1, 1408, 1536])
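# assuming the base config (16 frames, tubelet size 2, 224x224 inputs, patch size 16): the fixed mask leaves 1408 of the
# 8 * 196 = 1568 tubelet tokens masked, and 3 * 2 * 16**2 = 1536 pixel values are reconstructed per masked token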
expected_slice = torch.tensor(
[[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]], device=torch_device
)
self.assertEqual(outputs.logits.shape, expected_shape)
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4))
# verify the loss (`config.norm_pix_loss` = `True`)
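# norm_pix_loss=True normalizes each patch's target pixels to zero mean and unit variance before the reconstruction loss is computed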
expected_loss = torch.tensor([0.5142], device=torch_device)
self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))
# verify the loss (`config.norm_pix_loss` = `False`)
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short", norm_pix_loss=False).to(
torch_device
)
with torch.no_grad():
outputs = model(**inputs)
expected_loss = torch.tensor([0.6469], device=torch_device)
self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))
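The integration test loads a fixed bool_masked_pos from the Hub so the expected logits and losses stay deterministic. For reference, an equivalent random mask for this checkpoint can be built the same way the unit tests above do (a sketch, assuming the base configuration: 16 frames, tubelet size 2, 224x224 inputs, patch size 16):

import torch

num_frames, tubelet_size, image_size, patch_size = 16, 2, 224, 16
seq_length = (num_frames // tubelet_size) * (image_size // patch_size) ** 2  # 8 * 196 = 1568 tubelet tokens
num_masks = int(0.9 * seq_length)  # 1411 tokens at a 0.9 masking ratio

mask = torch.cat([torch.ones(num_masks), torch.zeros(seq_length - num_masks)])
bool_masked_pos = mask[torch.randperm(seq_length)].unsqueeze(0).bool()  # shape (1, 1568), True = masked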
@@ -48,47 +48,89 @@ SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
if equal_resolution:
image_inputs = []
for i in range(feature_extract_tester.batch_size):
if equal_resolution:
width = height = feature_extract_tester.max_resolution
else:
# To avoid getting image width/height 0
min_resolution = feature_extract_tester.min_resolution
if getattr(feature_extract_tester, "size_divisor", None):
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(feature_extract_tester.size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, feature_extract_tester.max_resolution), 2)
image_inputs.append(
np.random.randint(
255,
size=(
feature_extract_tester.num_channels,
feature_extract_tester.max_resolution,
feature_extract_tester.max_resolution,
width,
height,
),
dtype=np.uint8,
)
)
else:
image_inputs = []
# To avoid getting image width/height 0
min_resolution = feature_extract_tester.min_resolution
if getattr(feature_extract_tester, "size_divisor", None):
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(feature_extract_tester.size_divisor, min_resolution)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
image_inputs = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in image_inputs]
for i in range(feature_extract_tester.batch_size):
width, height = np.random.choice(np.arange(min_resolution, feature_extract_tester.max_resolution), 2)
image_inputs.append(
np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)
)
if torchify:
image_inputs = [torch.from_numpy(image) for image in image_inputs]
return image_inputs
def prepare_video(feature_extract_tester, width=10, height=10, numpify=False, torchify=False):
"""This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
video = []
for i in range(feature_extract_tester.num_frames):
video.append(np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8))
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]
if torchify:
image_inputs = [torch.from_numpy(x) for x in image_inputs]
video = [torch.from_numpy(frame) for frame in video]
return image_inputs
return video
def prepare_video_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
    """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
    one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.

    One can specify whether the videos are of the same resolution or not.
    """

    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

    video_inputs = []
    for i in range(feature_extract_tester.batch_size):
        if equal_resolution:
            width = height = feature_extract_tester.max_resolution
        else:
            width, height = np.random.choice(
                np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2
            )
        video = prepare_video(
            feature_extract_tester=feature_extract_tester,
            width=width,
            height=height,
            numpify=numpify,
            torchify=torchify,
        )
        video_inputs.append(video)

    return video_inputs
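# prepare_video_inputs thus returns batch_size videos, each a list of num_frames frames (PIL images by default, NumPy arrays with numpify=True, PyTorch tensors with torchify=True)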
class FeatureExtractionSavingTestMixin:
......
@@ -99,6 +99,7 @@ if is_torch_available():
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
MODEL_MAPPING,
AdaptiveEmbedding,
AutoModelForCausalLM,
@@ -182,6 +183,7 @@ class ModelTesterMixin:
*get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
*get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING),
]:
inputs_dict["labels"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
......