Unverified commit 1b20e2bb, authored by Yih-Dar, committed by GitHub
Browse files

Fix `Kosmos2Processor` batch mode (#27323)



* fix

* fix

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent a6e0d5a2
...@@ -211,7 +211,9 @@ class Kosmos2Processor(ProcessorMixin): ...@@ -211,7 +211,9 @@ class Kosmos2Processor(ProcessorMixin):
image_embeds_position_mask.append(mask) image_embeds_position_mask.append(mask)
if isinstance(text, list): if isinstance(text, list):
sorted_length = sorted([(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)]) sorted_length = sorted(
[(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)], key=lambda x: x[-1]
)
_, min_len_not_padded = sorted_length[0] _, min_len_not_padded = sorted_length[0]
idx, _ = sorted_length[-1] idx, _ = sorted_length[-1]
......
...@@ -686,7 +686,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): ...@@ -686,7 +686,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
prompt = ["<grounding>An image of", "<grounding>Describe this image in detail:"] prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]
# left padding # left padding
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left") processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left")
...@@ -699,10 +699,6 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): ...@@ -699,10 +699,6 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
# left padding gives identical results as non-padding # left padding gives identical results as non-padding
EXPECTED_PROCESSED_TEXT_0 = ( EXPECTED_PROCESSED_TEXT_0 = (
"<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
"warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
)
EXPECTED_PROCESSED_TEXT_1 = (
"<grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire" "<grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire"
"</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat" "</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat"
"</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object>" "</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object>"
...@@ -712,21 +708,21 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): ...@@ -712,21 +708,21 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
"nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy " "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy "
"atmosphere." "atmosphere."
) )
EXPECTED_PROCESSED_TEXT_1 = (
"<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
"warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
)
self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1]) self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1])
EXPECTED_FINAL_TEXT_0 = "An image of a snowman warming himself by a fire." EXPECTED_FINAL_TEXT_0 = (
EXPECTED_FINAL_TEXT_1 = (
"Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is " "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is "
"wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be " "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be "
"enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere." "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere."
) )
EXPECTED_FINAL_TEXT_1 = "An image of a snowman warming himself by a fire."
self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1]) self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1])
EXPECTED_ENTITIES_0 = [ EXPECTED_ENTITIES_0 = [
("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
]
EXPECTED_ENTITIES_1 = [
("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]),
("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]),
("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]),
...@@ -734,6 +730,10 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): ...@@ -734,6 +730,10 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]), ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]),
("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]), ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]),
] ]
EXPECTED_ENTITIES_1 = [
("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
]
self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1]) self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1])
# right padding # right padding
...@@ -746,6 +746,6 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): ...@@ -746,6 +746,6 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
all_entities = [x[1] for x in final_text_with_entities] all_entities = [x[1] for x in final_text_with_entities]
# For right padding, only the non-padded sequences will give the same results as non-padding # For right padding, only the non-padded sequences will give the same results as non-padding
self.assertEqual(processed_text[1], EXPECTED_PROCESSED_TEXT_1) self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
self.assertEqual(all_final_text[1], EXPECTED_FINAL_TEXT_1) self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
self.assertListEqual(all_entities[1], EXPECTED_ENTITIES_1) self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment