Unverified commit 8a312956, authored by Pablo Montalvo, committed by GitHub

Fuyu: improve image processing (#27007)



* Fix Fuyu image scaling bug

The previous scaling logic could produce negative padding amounts, and hence inference errors, for certain image sizes (failure mode sketched below).
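For context, a minimal sketch of the failure mode (hypothetical helper and default target size; the real processor's names differ). Fitting an image into the target box and rounding the resized dims up can overshoot the target by a pixel in floating point, so the padding amount `target - resized` must be clamped at zero:

    import math

    def pad_amounts(orig_h, orig_w, target_h=1080, target_w=1920):
        # Scale to fit inside the target box while keeping the aspect ratio.
        scale = min(target_h / orig_h, target_w / orig_w)
        new_h, new_w = math.ceil(orig_h * scale), math.ceil(orig_w * scale)
        # e.g. 450 * (160 / 450) can evaluate to 160.00000000000003, and
        # ceil() then yields 161: without the max(..., 0) clamp the padding
        # goes negative and breaks inference for those image sizes.
        return max(target_h - new_h, 0), max(target_w - new_w, 0)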

* initial rework commit

* add batching capabilities, refactor image processing

* add functional batching for a list of images and texts

* make args explicit

* Fuyu processing update (#27133)

* Add file headers

* Add file headers

* First pass - preprocess method with standard args

* First pass image processor rework

* Small tweaks

* More args and docstrings

* Tidying iterating over batch

* Tidying up

* Modify to have quick tests (for now)

* Fix up

* BatchFeature

* Passing tests

* Add tests for processor

* Sense check when patchifying

* Add some tests

* FuyuBatchFeature

* Post-process box coordinates

* Update to `size` in processor

* Remove unused and duplicate constants

* Store unpadded dims after resize

* Fix up

* Return FuyuBatchFeature

* Get unpadded sizes after resize

* Update exception

* Fix return

* Convert input `<box>` coordinates to model format.

* Post-process point coordinates; support multiple boxes/points in a single sequence (coordinate mapping sketched below)
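To make the two coordinate bullets above concrete: `<box>`/`<point>` values written in original-image space are rescaled into the space of the resized image the model sees, and generated coordinates are mapped back during post-processing. A deliberately simplified sketch (hypothetical helper names; the real processor also handles Fuyu's coordinate token format and per-image scale factors):

    def to_model_space(coord: float, scale: float) -> int:
        # scale = resized_dim / original_dim along the matching axis
        return round(coord * scale)

    def to_original_space(coord: float, scale: float) -> float:
        # Inverse mapping applied when post-processing generated box/point coords
        return coord / scale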

* Replace constants

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Preprocess List[List[image]]

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update to Amy's latest state.

* post-processing returns a list of tensors

* Fix error when target_sizes is None
Co-authored-by: Pablo Montalvo <pablo.montalvo.leroux@gmail.com>

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Review comments

* Update src/transformers/models/fuyu/image_processing_fuyu.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Fix up

* Fix up

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-72-126.ec2.internal>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Pablo Montalvo <pablo.montalvo.leroux@gmail.com>

* Fix conflicts in fuyu_follow_up_image_processing (#27228)

fixing conflicts and updating on main

* Revert "Fix conflicts in fuyu_follow_up_image_processing" (#27232)

Revert "Fix conflicts in fuyu_follow_up_image_processing (#27228)"

This reverts commit acce10b6c653dc7041fb9d18cfed55775afd6207.

---------
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-72-126.ec2.internal>
parent 9b25c164
src/transformers/feature_extraction_utils.py
@@ -112,17 +112,9 @@ class BatchFeature(UserDict):
     def items(self):
         return self.data.items()
 
-    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
-        """
-        Convert the inner content to tensors.
-
-        Args:
-            tensor_type (`str` or [`~utils.TensorType`], *optional*):
-                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
-                `None`, no modification is done.
-        """
+    def _get_is_as_tensor_fns(self, tensor_type: Optional[Union[str, TensorType]] = None):
         if tensor_type is None:
-            return self
+            return None, None
 
         # Convert to TensorType
         if not isinstance(tensor_type, TensorType):
@@ -167,6 +159,21 @@ class BatchFeature(UserDict):
             return np.asarray(value, dtype=dtype)
 
         is_tensor = is_numpy_array
+        return is_tensor, as_tensor
+
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+        """
+        Convert the inner content to tensors.
+
+        Args:
+            tensor_type (`str` or [`~utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
+                `None`, no modification is done.
+        """
+        if tensor_type is None:
+            return self
+
+        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
 
         # Do the tensor conversion in batch
         for key, value in self.items():
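The split above exists so that subclasses can reuse the type-detection and conversion helpers on values that are not plain arrays; the FuyuBatchFeature introduced by this PR applies them to lists of per-image tensors. A rough sketch of the pattern (an illustrative subclass, not the shipped FuyuBatchFeature):

    from typing import Optional, Union

    import numpy as np
    from transformers import BatchFeature, TensorType

    class NestedBatchFeature(BatchFeature):  # illustrative only
        def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
            if tensor_type is None:
                return self
            # Reuse the factored-out detection/conversion pair on each
            # per-sample element instead of on the whole value.
            is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
            for key, value in self.items():
                self[key] = [v if is_tensor(v) else as_tensor(v) for v in value]
            return self

    # e.g. NestedBatchFeature({"image_patches": [np.zeros((4, 3))]}).convert_to_tensors("np")

With this split, convert_to_tensors stays a thin loop and the framework-specific detection logic lives in one place.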
src/transformers/models/fuyu/modeling_fuyu.py
@@ -257,8 +257,10 @@ class FuyuForCausalLM(FuyuPreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
             if image_patches is not None and past_key_values is None:
-                patch_embeddings = self.vision_embed_tokens(image_patches.to(self.vision_embed_tokens.weight.dtype))
-                patch_embeddings = patch_embeddings.to(inputs_embeds.device)
+                patch_embeddings = [
+                    self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
+                    for patch in image_patches
+                ]
                 inputs_embeds = self.gather_continuous_embeddings(
                     word_embeddings=inputs_embeds,
                     continuous_embeddings=patch_embeddings,
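The list comprehension above replaces a single stacked projection because, with the new batched processing, image_patches arrives as a sequence of per-sample tensors whose patch counts differ, so each (1, num_patches_i, patch_dim) entry is projected separately and squeezed before being gathered. A toy shape check (made-up dimensions, not the model's real sizes):

    import torch
    from torch import nn

    vision_embed_tokens = nn.Linear(16, 8)  # stand-in for the real projection
    image_patches = [torch.randn(1, n, 16) for n in (5, 9)]  # variable patch counts

    patch_embeddings = [
        vision_embed_tokens(p.to(vision_embed_tokens.weight.dtype)).squeeze(0)
        for p in image_patches
    ]
    print([e.shape for e in patch_embeddings])  # [torch.Size([5, 8]), torch.Size([9, 8])]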
tests/models/fuyu/test_image_processing_fuyu.py
@@ -24,7 +24,8 @@ if is_vision_available():
 @require_torchvision
 class TestFuyuImageProcessor(unittest.TestCase):
     def setUp(self):
-        self.processor = FuyuImageProcessor(target_height=160, target_width=320, padding_value=1.0)
+        self.size = {"height": 160, "width": 320}
+        self.processor = FuyuImageProcessor(size=self.size, padding_value=1.0)
         self.batch_size = 3
         self.channels = 3
         self.height = 300
@@ -38,29 +39,25 @@ class TestFuyuImageProcessor(unittest.TestCase):
         self.sample_image_pil = Image.fromarray(self.sample_image)
 
     def test_patches(self):
-        expected_num_patches = self.processor.get_num_patches(
-            img_h=self.height, img_w=self.width, patch_dim_h=self.image_patch_dim_h, patch_dim_w=self.image_patch_dim_w
-        )
+        expected_num_patches = self.processor.get_num_patches(image_height=self.height, image_width=self.width)
 
-        patches_final = self.processor.patchify_image(
-            image=self.image_input, patch_dim_h=self.image_patch_dim_h, patch_dim_w=self.image_patch_dim_w
-        )
+        patches_final = self.processor.patchify_image(image=self.image_input)
         assert (
             patches_final.shape[1] == expected_num_patches
         ), f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}."
 
     def test_scale_to_target_aspect_ratio(self):
         # (h:450, w:210) fitting (160, 320) -> (160, 210*160/450)
-        scaled_image = self.processor._scale_to_target_aspect_ratio(self.sample_image)
+        scaled_image = self.processor.resize(self.sample_image, size=self.size)
         self.assertEqual(scaled_image.shape[0], 160)
         self.assertEqual(scaled_image.shape[1], 74)
 
     def test_apply_transformation_numpy(self):
-        transformed_image = self.processor.apply_transformation(self.sample_image)
-        self.assertEqual(transformed_image.shape[0], 160)
-        self.assertEqual(transformed_image.shape[1], 320)
+        transformed_image = self.processor.preprocess(self.sample_image).images[0][0]
+        self.assertEqual(transformed_image.shape[1], 160)
+        self.assertEqual(transformed_image.shape[2], 320)
 
     def test_apply_transformation_pil(self):
-        transformed_image = self.processor.apply_transformation(self.sample_image_pil)
-        self.assertEqual(transformed_image.shape[0], 160)
-        self.assertEqual(transformed_image.shape[1], 320)
+        transformed_image = self.processor.preprocess(self.sample_image_pil).images[0][0]
+        self.assertEqual(transformed_image.shape[1], 160)
+        self.assertEqual(transformed_image.shape[2], 320)
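For orientation, get_num_patches and patchify_image exercised above amount to splitting the (padded) image into a grid of fixed-size patches; a rough standalone equivalent, assuming height and width divide evenly by the patch dims (Fuyu's defaults are 30x30):

    import torch

    def patchify(image: torch.Tensor, patch_h: int = 30, patch_w: int = 30) -> torch.Tensor:
        # image: (B, C, H, W) with H % patch_h == 0 and W % patch_w == 0
        b, c, h, w = image.shape
        patches = image.unfold(2, patch_h, patch_h).unfold(3, patch_w, patch_w)
        # (B, C, H//ph, W//pw, ph, pw) -> (B, num_patches, ph * pw * C), channels last
        return patches.permute(0, 2, 3, 4, 5, 1).reshape(b, -1, patch_h * patch_w * c)

    # e.g. a 300x300 image in 30x30 patches -> a 10x10 grid of 100 patches
    num_patches = (300 // 30) * (300 // 30)
    assert patchify(torch.randn(1, 3, 300, 300)).shape[1] == num_patches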
tests/models/fuyu/test_modeling_fuyu.py
@@ -3,7 +3,7 @@
 import unittest
 
 import requests
 
-from transformers import AutoTokenizer, FuyuConfig, is_torch_available, is_vision_available
+from transformers import FuyuConfig, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
 
 from ...test_modeling_common import ids_tensor, random_attention_mask
@@ -14,7 +14,7 @@ if is_vision_available():
 if is_torch_available() and is_vision_available():
-    from transformers import FuyuImageProcessor, FuyuProcessor
+    from transformers import FuyuProcessor
 
 if is_torch_available():
@@ -267,11 +267,8 @@ class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
     all_model_classes = ("FuyuForCausalLM") if is_torch_available() else ()
 
     def setUp(self):
-        self.pretrained_model_name = "huggingface/new_model_release_weights"
-        tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name)
-        image_processor = FuyuImageProcessor()
-
-        self.processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        self.pretrained_model_name = "adept/fuyu-8b"
+        self.processor = FuyuProcessor.from_pretrained(self.pretrained_model_name)
         self.model = FuyuForCausalLM.from_pretrained(self.pretrained_model_name)
         self.bus_image_url = (
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
@@ -280,9 +277,8 @@ class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
     @slow
     def test_model_8b_chat_greedy_generation_bus_captioning(self):
-        EXPECTED_TEXT_COMPLETION = """A bus parked on the side of a road.|ENDOFTEXT|"""
+        EXPECTED_TEXT_COMPLETION = """A blue bus parked on the side of a road.|ENDOFTEXT|"""
         text_prompt_coco_captioning = "Generate a coco-style caption.\n"
 
         model_inputs_bus_captioning = self.processor(text=text_prompt_coco_captioning, images=self.bus_image_pil)
         generated_tokens = self.model.generate(**model_inputs_bus_captioning, max_new_tokens=10)
         text = self.processor.tokenizer.batch_decode(generated_tokens)
@@ -297,7 +293,7 @@ class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
     """
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_model_8b_chat_greedy_generation_bus_color(self):
         EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|"
         text_prompt_bus_color = "What color is the bus?\n"
@@ -314,7 +310,7 @@ class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_model_8b_chat_greedy_generation_chart_vqa(self):
         # fmt: off
         EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",]
@@ -340,7 +336,7 @@ class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
         self.assertEqual(expected_text_completion, clean_sequence)
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_model_8b_chat_greedy_generation_bounding_box(self):
         EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|"
         text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\\nWilliams"  # noqa: E231
tests/models/fuyu/test_processing_fuyu.py
@@ -26,16 +26,14 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here?
     """ """
 
     def setUp(self):
-        pretrained_model_name = "huggingface/pre_release_model"
-        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
-        image_processor = FuyuImageProcessor()
+        pretrained_model_name = "adept/fuyu-8b"
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
+        self.image_processor = FuyuImageProcessor()
 
-        processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
-        text_prompt = "Generate a coco-style caption.\\n"
+        self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer)
+        self.text_prompt = "Generate a coco-style caption.\\n"
         bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
-        bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
-
-        self.one_image_bus_model_inputs = processor(text=text_prompt, images=bus_image_pil)
+        self.bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
 
     def test_fuyu_processing(self):
         """
@@ -44,11 +42,119 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here?
        # fmt: off
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
+
+        one_image_bus_model_inputs = self.processor(text=self.text_prompt, images=self.bus_image_pil)
         # fmt: on
 
-        torch.testing.assert_close(
-            self.one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS
-        )
-        torch.testing.assert_close(self.one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS)
+        torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
+        torch.testing.assert_close(one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS)

    def test_fuyu_processing_no_image(self):
        """
        Test to check processor works with just text input
        """
        processor_outputs = self.processor(text=self.text_prompt)
        tokenizer_outputs = self.tokenizer(self.text_prompt)
        self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"])

    def test_fuyu_processing_no_text(self):
        """
        Test to check processor works with just image input
        """
        # fmt: off
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93,
94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160,
161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200,
201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227,
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
-1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267,
268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294,
295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
]).to(torch.int64)
        # fmt: on

        processor_outputs = self.processor(images=self.bus_image_pil)
        self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all())

    def test_fuyu_processing_multiple_image_sample(self):
        """
        Test to check processor works with multiple image inputs for a single text input
        """
        # fmt: off
SINGLE_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
SINGLE_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
SINGLE_RESIZED_IMAGE_PATCH_INPUTS = torch.Tensor([[ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, 12, 13, 14, -1, 15, 16, 17, -1, 18, 19, 20, -1, 21, 22, 23, -1, 24, 25, 26, -1, 27, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])
SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122]])
        # fmt: on

        # Batch of two images - equally sized
        images = [self.bus_image_pil, self.bus_image_pil]
        processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)

        self.assertTrue(
            (
                processor_outputs["image_patches_indices"]
                == torch.cat([SINGLE_IMAGE_PATCH_INPUTS, SINGLE_IMAGE_PATCH_INPUTS], dim=0)
            ).all()
        )
        self.assertTrue(
            (
                processor_outputs["input_ids"]
                == torch.cat([SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, SINGLE_PADDED_UNPACKED_TOKEN_INPUTS], dim=0)
            ).all()
        )

        # Processes single images with different sizes as expected
        images = [self.bus_image_pil]
        processor_outputs = self.processor(text=self.text_prompt, images=images)
        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
        self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())

        images = [self.bus_image_pil.resize((64, 300))]
        processor_outputs = self.processor(text=self.text_prompt, images=images)
        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
        self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())

        # Batch of two images - different sizes. Left-pads the smaller image inputs
        images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
        processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)

        padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
        padded_single_resized_image_patch = torch.cat(
            [torch.ones([1, padding_len_patch]) * -1, SINGLE_RESIZED_IMAGE_PATCH_INPUTS], dim=1
        )
        expected_image_patch_inputs = torch.cat([SINGLE_IMAGE_PATCH_INPUTS, padded_single_resized_image_patch], dim=0)

        padding_len_token = (
            SINGLE_PADDED_UNPACKED_TOKEN_INPUTS.shape[1] - SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS.shape[1]
        )
        padded_single_resized_padded_unpacked_token_inputs = torch.cat(
            [torch.zeros([1, padding_len_token]), SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS], dim=1
        )
        expected_padded_unpacked_token_inputs = torch.cat(
            [SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, padded_single_resized_padded_unpacked_token_inputs], dim=0
        )

        self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all())
        self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all())
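The batching rule these assertions encode can be summarized in one helper: within a batch, shorter samples are left-padded, with -1 as the "no patch" marker for image_patches_indices and 0 as the pad id for input_ids. A minimal sketch mirroring the expected-tensor construction above (hypothetical helper, not part of the library):

    import torch

    def left_pad(rows, pad_value):
        # rows: list of 1-D tensors of differing lengths -> (len(rows), max_len)
        width = max(row.shape[-1] for row in rows)
        return torch.stack(
            [torch.cat([torch.full((width - row.shape[-1],), pad_value, dtype=row.dtype), row]) for row in rows]
        )

    # e.g. left_pad(indices_rows, -1) for image_patches_indices,
    #      left_pad(token_rows, 0) for input_ids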
@require_torch
@@ -97,7 +203,6 @@ class TestProcessImagesForModelInput(unittest.TestCase):
         """
         Adding a mix of present and absent images.
         """
-        self.image_processor = FuyuImageProcessor()
         self.image_input = torch.randn([1, 1, 3, 64, 64])
         self.image_present = torch.tensor([[1]])
@@ -108,19 +213,19 @@
         self.image_placeholder_id = 999
         self.image_newline_id = 888
         self.variable_sized = True
+        self.image_processor = FuyuImageProcessor(
+            patch_size={"height": self.image_patch_dim_h, "width": self.image_patch_dim_w}
+        )
 
     def test_process_images_for_model_input_fixed_sized(self):
         self.variable_sized = False
-        result = self.image_processor.process_images_for_model_input(
+        result = self.image_processor.preprocess_with_tokenizer_info(
             image_input=self.image_input,
             image_present=self.image_present,
             image_unpadded_h=self.image_unpadded_h,
             image_unpadded_w=self.image_unpadded_w,
-            image_patch_dim_h=self.image_patch_dim_h,
-            image_patch_dim_w=self.image_patch_dim_w,
             image_placeholder_id=self.image_placeholder_id,
             image_newline_id=self.image_newline_id,
             variable_sized=self.variable_sized,
         )
         print(result["images"][0][0])
         self.assertEqual(result["images"][0][0].shape, torch.Size([3, 64, 64]))