Unverified Commit e71f2863 authored by Raushan Turganbay, committed by GitHub

Add LLaVa NeXT Video (#31252)



* squash into single commit

* run diff once more

* docstring

* tests

* minor changes and ready to go

* Update src/transformers/models/llava_next_video/processing_llava_next_video.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/vipllava/test_modeling_vipllava.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* [run-slow] llava-next-video

* [run-slow] llava-next-video

* [run-slow] llava_next_video

* fix two tests

* fix slow tests

* remove logit checks due to numeric errors

* run test once more

* [run-slow] llava_next_video

* final try to pass the test

* [run-slow] llava_next_video

* [run-slow] llava_next_video

* [run-slow] llava_next_video

* style

* fix

* style

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent b1ec7454
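For orientation, a minimal usage sketch of the model this commit adds, mirroring the integration tests further down (the checkpoint id, prompt template and processor/generate calls come from those tests; the local video path is illustrative only):

import numpy as np
from transformers import AutoProcessor, LlavaNextVideoForConditionalGeneration

processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
)

# Videos are passed as arrays of frames; the tests download a .npy clip from the Hub.
video = np.load("video_demo.npy")  # illustrative local path
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"

inputs = processor(prompt, videos=video, return_tensors="pt").to(model.device)
output = model.generate(**inputs, do_sample=False, max_new_tokens=40)
print(processor.decode(output[0], skip_special_tokens=True))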
@@ -359,6 +359,13 @@ class LlavaNextImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class LlavaNextVideoImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class Mask2FormerImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
...
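The new dummy class follows the existing pattern in this file: with the vision backend (Pillow) missing, the import still resolves and only instantiation fails. A small sketch of that behaviour, assuming an environment without the vision extra installed:

from transformers import LlavaNextVideoImageProcessor  # resolves to the dummy class above

try:
    LlavaNextVideoImageProcessor()  # requires_backends raises here, not at import time
except ImportError as err:
    print(err)  # the message tells you to install the missing "vision" backend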
@@ -204,6 +204,14 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="Compile not yet supported because in LLava models")
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Compile not yet supported because in LLava models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@require_torch
class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
...
@@ -265,6 +265,14 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
def test_cpu_offload(self):
pass
@unittest.skip(reason="Compile not yet supported because in LLava models")
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Compile not yet supported because in LLava models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@require_torch
class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
...
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import LlavaNextVideoImageProcessor
class LlavaNextVideoProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=5,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
):
size = size if size is not None else {"shortest_edge": 20}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
def expected_output_image_shape(self, images):
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
images = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
# let's simply copy the frames to fake a long video-clip
if numpify or torchify:
videos = []
for image in images:
if numpify:
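# np.ndarray.repeat along axis 0 stacks 8 copies of the single frame, adding a leading frame axis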
video = image[None, ...].repeat(8, 0)
else:
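# torch.Tensor.repeat tiles 8 times along the new frame dim and once along the rest, giving the same 8 identical frames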
video = image[None, ...].repeat(8, 1, 1, 1)
videos.append(video)
else:
videos = []
for pil_image in images:
videos.append([pil_image] * 8)
return videos
@require_torch
@require_vision
class LlavaNextVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = LlavaNextVideoImageProcessor if is_vision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = LlavaNextVideoProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "center_crop"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 20})
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL videos
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
for video in video_inputs:
self.assertIsInstance(video[0], Image.Image)
# Test not batched input (a single video, to check the processor handles videos even without images)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
expected_output_video_shape = (1, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
expected_output_video_shape = (5, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, np.ndarray)
# Test not batched input (a single video, to check the processor handles videos even without images)
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
expected_output_video_shape = (1, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
expected_output_video_shape = (5, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, torch.Tensor)
# Test not batched input
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
expected_output_video_shape = (1, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
expected_output_video_shape = (5, 8, 3, 18, 18)
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
@unittest.skip("LlavaNextVideoImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
def test_call_numpy_4_channels(self):
pass
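For reference, the expected shapes asserted above decompose as (batch_size, num_frames, num_channels, crop_height, crop_width) under the tester defaults; a quick sanity check using the values from LlavaNextVideoProcessingTester:

batch_size, num_frames, num_channels = 5, 8, 3  # tester batch, frames faked by prepare_video_inputs, RGB
crop_size = {"height": 18, "width": 18}  # tester default crop_size
expected = (batch_size, num_frames, num_channels, crop_size["height"], crop_size["width"])
assert expected == (5, 8, 3, 18, 18)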
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Llava-NeXT model."""
import gc
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import (
AutoProcessor,
LlavaNextVideoConfig,
LlavaNextVideoForConditionalGeneration,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
require_bitsandbytes,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
floats_tensor,
ids_tensor,
)
if is_torch_available():
import torch
else:
is_torch_greater_or_equal_than_2_0 = False
if is_vision_available():
from PIL import Image
class LlavaNextVideoVisionText2TextModelTester:
def __init__(
self,
parent,
ignore_index=-100,
image_token_index=0,
video_token_index=1,
projector_hidden_act="gelu",
seq_length=7,
vision_feature_select_strategy="default",
vision_feature_layer=-1,
text_config={
"model_type": "llama",
"seq_length": 7,
"is_training": True,
"use_input_mask": True,
"use_token_type_ids": False,
"use_labels": True,
"vocab_size": 99,
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 37,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 580,
"type_vocab_size": 16,
"type_sequence_label_size": 2,
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
},
is_training=True,
vision_config={
"image_size": 16,
"patch_size": 2,
"num_channels": 3,
"is_training": True,
"hidden_size": 32,
"projection_dim": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 37,
"dropout": 0.1,
"attention_dropout": 0.1,
"initializer_range": 0.02,
},
):
self.parent = parent
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.video_token_index = video_token_index
self.projector_hidden_act = projector_hidden_act
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
self.text_config = text_config
self.vision_config = vision_config
self.seq_length = seq_length
self.num_hidden_layers = text_config["num_hidden_layers"]
self.vocab_size = text_config["vocab_size"]
self.hidden_size = text_config["hidden_size"]
self.num_attention_heads = text_config["num_attention_heads"]
self.is_training = is_training
self.batch_size = 3
self.num_channels = 3
self.image_size = 30
self.encoder_seq_length = 468
self.image_grid_pinpoints = [[32, 32]]
def get_config(self):
return LlavaNextVideoConfig(
text_config=self.text_config,
vision_config=self.vision_config,
ignore_index=self.ignore_index,
image_token_index=self.image_token_index,
video_token_index=self.video_token_index,
projector_hidden_act=self.projector_hidden_act,
vision_feature_select_strategy=self.vision_feature_select_strategy,
vision_feature_layer=self.vision_feature_layer,
image_grid_pinpoints=self.image_grid_pinpoints,
)
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[
self.batch_size,
5,
self.vision_config["num_channels"],
self.vision_config["image_size"],
self.vision_config["image_size"],
]
)
pixel_values_videos = floats_tensor(
[
self.batch_size,
8,
self.vision_config["num_channels"],
self.vision_config["image_size"],
self.vision_config["image_size"],
]
)
config = self.get_config()
return config, pixel_values, pixel_values_videos
def prepare_config_and_inputs_for_common(self):
config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs()
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
# make attention mask left-padded to avoid issues with "model has no attribute padding_side"
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
attention_mask[:, :1] = 0
# we are giving 3 images and 3 videos, so make sure each row contains the image and video special tokens
input_ids[:, 1] = config.image_token_index
input_ids[:, 2] = config.video_token_index
labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
# mask out the positions of the image/video tokens
labels[:, 1] = self.ignore_index
labels[:, 2] = self.ignore_index
inputs_dict = {
"pixel_values": pixel_values,
"pixel_values_videos": pixel_values_videos,
"image_sizes": torch.tensor(
[[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size
),
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
}
return config, inputs_dict
def create_and_check_llava_next_video_model_fp16_forward(
self, config, input_ids, pixel_values, pixel_values_videos, attention_mask, image_sizes
):
model = LlavaNextVideoForConditionalGeneration(config=config)
model.to(torch_device)
model.half()
model.eval()
logits = model(
input_ids=input_ids,
attention_mask=attention_mask,
image_sizes=image_sizes,
pixel_values=pixel_values.to(torch.bfloat16),
pixel_values_videos=pixel_values_videos.to(torch.bfloat16),
return_dict=True,
)["logits"]
self.parent.assertFalse(torch.isnan(logits).any().item())
def create_and_check_llava_next_video_model_fp16_autocast_forward(
self, config, input_ids, pixel_values, pixel_values_videos, attention_mask, image_sizes
):
config.torch_dtype = torch.float16
model = LlavaNextVideoForConditionalGeneration(config=config)
model.to(torch_device)
model.eval()
with torch.autocast(device_type="cuda", dtype=torch.float16):
logits = model(
input_ids=input_ids,
attention_mask=attention_mask,
image_sizes=image_sizes,
pixel_values=pixel_values.to(torch.bfloat16),
pixel_values_videos=pixel_values_videos.to(torch.bfloat16),
return_dict=True,
)["logits"]
self.parent.assertFalse(torch.isnan(logits).any().item())
@require_torch
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextVideoForConditionalGeneration`.
"""
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
def setUp(self):
self.model_tester = LlavaNextVideoVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False)
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if "image_newline" in name:
continue
elif param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
def test_inputs_embeds(self):
# overwritten because llava cannot accept both inputs_embeds and pixel values as input
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
del inputs["pixel_values_videos"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="Feedforward chunking is not yet supported")
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="CPU offload is not yet supported")
def test_cpu_offload(self):
pass
@unittest.skip(
reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)"
)
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(
reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)"
)
def test_sdpa_can_dispatch_on_flash(self):
pass
@require_torch
class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
image_file = hf_hub_download(
repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset"
)
video_file = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
)
self.image = Image.open(image_file)
self.video = np.load(video_file)
self.prompt_image = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
self.prompt_video = "USER: <video>\nWhy is this video funny? ASSISTANT:"
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
@slow
@require_bitsandbytes
def test_small_model_integration_test(self):
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True, cache_dir="./"
)
inputs = self.processor(self.prompt_video, videos=self.video, return_tensors="pt")
expected_input_ids = [
1,
3148,
1001,
29901,
29871,
32000,
13,
11008,
338,
445,
4863,
2090,
1460,
29973,
319,
1799,
9047,
13566,
29901,
]
self.assertListEqual(expected_input_ids, inputs.input_ids[0].tolist())
# verify single forward pass
inputs = inputs.to(torch_device)
with torch.no_grad():
output = model(**inputs)
# verify generation
output = model.generate(**inputs, do_sample=False, max_new_tokens=40)
EXPECTED_DECODED_TEXT = 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the book. The child appears to be reading a book, but instead of a calm and focused reading experience' # fmt: skip
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
@require_bitsandbytes
def test_small_model_integration_test_batch(self):
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True, cache_dir="./"
)
inputs = self.processor(
[self.prompt_video, self.prompt_video],
videos=[self.video, self.video],
return_tensors="pt",
padding=True,
).to(torch_device)
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
EXPECTED_DECODED_TEXT = ['USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the', 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the'] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
@require_bitsandbytes
def test_small_model_integration_test_batch_different_vision_types(self):
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf",
load_in_4bit=True,
cache_dir="./",
)
inputs = self.processor(
[self.prompt_image, self.prompt_video],
images=self.image,
videos=self.video,
return_tensors="pt",
padding=True,
).to(torch_device)
# check loss when labels are passed
inputs["labels"] = inputs["input_ids"].clone()
with torch.no_grad():
output = model(**inputs)
self.assertTrue(output.loss is not None)
# verify generation
output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
EXPECTED_DECODED_TEXT = 'USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmark test for a machine learning model. It shows the performance of various models on a task, with the x-axis representing the number of parameters (measured in millions) and the y' # fmt: skip
self.assertEqual(self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT)
@slow
@require_bitsandbytes
def test_small_model_integration_test_batch_matches_single(self):
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True, cache_dir="./"
)
inputs_batched = self.processor(
[self.prompt_video, self.prompt_image],
images=[self.image],
videos=[self.video],
return_tensors="pt",
padding=True,
).to(torch_device)
inputs_single = self.processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
# verify generation
output_batched = model.generate(**inputs_batched, do_sample=False, max_new_tokens=50)
output_single = model.generate(**inputs_single, do_sample=False, max_new_tokens=50)
self.assertEqual(
self.processor.decode(output_batched[0], skip_special_tokens=True),
self.processor.decode(output_single[0], skip_special_tokens=True),
)
@@ -223,6 +223,14 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="Pass because video-LLava requires `attention_mask is not None`")
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Pass because video-LLava requires `attention_mask is not None`")
def test_sdpa_can_dispatch_on_flash(self):
pass
def test_mixed_input(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
...
@@ -185,6 +185,14 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="Compile not yet supported because it is not yet supported in LLava")
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Compile not yet supported because in LLava models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@require_torch
class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
...
@@ -333,7 +333,10 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
original_methods = {f.name.value if hasattr(f, "name") else f: f for f in original_node.body.body}
updated_methods = {f.name.value if hasattr(f, "name") else f: f for f in updated_node.body.body}
end_meth = []
-for name, func in original_methods.items():
+# Iterate directly over node.body, since a property and its setter share a name and would overwrite each other in a dict
+for func in original_node.body.body:
+    name = func.name.value if hasattr(func, "name") else func
if name in updated_methods and updated_methods[name] is not None:
new_params = updated_methods[name].params
# Replace the method in the replacement class, preserving decorators
@@ -347,6 +350,11 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
func = func.with_changes(body=updated_methods[name].body, params=new_params)
end_meth.append(func)
# Port new methods that are defined only in diff-file and append at the end
for name, func in updated_methods.items():
if name not in original_methods and func is not None and isinstance(func, cst.FunctionDef):
end_meth.append(func)
result_node = original_node.with_changes(body=cst.IndentedBlock(body=end_meth))
temp_module = cst.Module(body=[result_node])
new_module = MetadataWrapper(temp_module)
@@ -454,7 +462,7 @@ class DiffConverterTransformer(CSTTransformer):
)
super_file_name = self.imported_mapping[super_class]  # we need to get the parsed tree
-model_name = re.search(r"_(\S*)", super_file_name)
+model_name = re.search(r"models\.\w*?\.\w*?_(\S*)", super_file_name)
if model_name:
model_name = model_name.groups()[0]
else:
...
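The comment about properties and setters is the crux of the first change above: keying functions by name keeps only the last definition, so a getter/setter pair collapses into one dict entry. A minimal illustration with hypothetical code (not from this repository), using the same libcst accessors as replace_call_to_super:

import libcst as cst

src = """
class Dummy:
    @property
    def value(self):
        return self._value

    @value.setter
    def value(self, new_value):
        self._value = new_value
"""
class_node = cst.parse_module(src).body[0]
functions = [f for f in class_node.body.body if isinstance(f, cst.FunctionDef)]
by_name = {f.name.value: f for f in functions}  # the setter silently overwrites the property getter

print(len(functions))  # 2 -> the class really defines two FunctionDef nodes
print(len(by_name))    # 1 -> only one survives when keyed by name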