Unverified Commit 9a4a119c authored by Arthur, committed by GitHub

[`Llava`] + CIs: fix red CIs and Llava integration tests (#30440)



* nit

* nit and fmt skip

* fixup

* Update src/transformers/convert_slow_tokenizer.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* set to true

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 767e3518
@@ -46,7 +46,7 @@ def import_protobuf(error_message=""):
 def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
     if add_prefix_space:
         prepend_scheme = "always"
-        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
+        if not getattr(original_tokenizer, "legacy", True):
             prepend_scheme = "first"
     else:
         prepend_scheme = "never"
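
For context: the rewritten check in this hunk is behaviourally equivalent to the old `hasattr`-based one; a tokenizer without a `legacy` attribute still resolves to `"always"`. A minimal standalone sketch (not from the PR) covering the three relevant cases:

```python
# Minimal standalone sketch (not from the PR): the old hasattr-based check and
# the new getattr-based check pick the same prepend scheme whether `legacy` is
# missing, True, or False.
class FakeTokenizer:
    pass

def scheme_old(tok):
    scheme = "always"
    if hasattr(tok, "legacy") and not tok.legacy:
        scheme = "first"
    return scheme

def scheme_new(tok):
    scheme = "always"
    if not getattr(tok, "legacy", True):
        scheme = "first"
    return scheme

no_legacy = FakeTokenizer()                      # no `legacy` attribute
legacy_true = FakeTokenizer(); legacy_true.legacy = True
legacy_false = FakeTokenizer(); legacy_false.legacy = False

for tok in (no_legacy, legacy_true, legacy_false):
    assert scheme_old(tok) == scheme_new(tok)
print("old and new checks agree")
```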
@@ -1393,7 +1393,7 @@ class LlamaConverter(SpmConverter):
         return tokenizer

     def normalizer(self, proto):
-        if self.original_tokenizer.legacy:
+        if getattr(self.original_tokenizer, "legacy", True):
             sequence = []
             if getattr(self.original_tokenizer, "add_prefix_space"):
                 sequence += [normalizers.Prepend(prepend="▁")]
......
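
The `normalizer` hunk is where the default actually matters: plain attribute access on a tokenizer that never defines `legacy` raises `AttributeError`, while `getattr(..., "legacy", True)` falls back to the legacy normalizer path. A minimal sketch (not from the PR) of that difference:

```python
# Minimal sketch (not from the PR): plain attribute access fails for a
# tokenizer that never defines `legacy`; getattr with a default of True
# treats such a tokenizer as legacy instead of raising.
class FakeTokenizer:
    pass

tok = FakeTokenizer()

try:
    tok.legacy                                   # old code path: raises
except AttributeError:
    print("old access raises AttributeError")

assert getattr(tok, "legacy", True) is True      # new code path: defaults to legacy
print("new access defaults to legacy=True")
```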
@@ -591,14 +591,6 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
         fast_tokenizer.add_tokens("<image>", True)

         prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
-        # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens.
-        # https://github.com/huggingface/transformers/pull/28881 is the fix for this.
-        self.assertEqual(
-            slow_tokenizer.tokenize(prompt),
-            ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n']
-        ) # fmt: skip
-        self.assertEqual(
-            fast_tokenizer.tokenize(prompt),
-            ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '<image>', '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n']
-        ) # fmt: skip
+        EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip
+        self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
+        self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
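
The rewritten test asserts that the slow and fast Llava tokenizers now produce identical tokens for the chat prompt; before the fix the fast tokenizer emitted an extra `▁` after added special tokens (`'▁system'`, `'▁user'`, `'▁assistant'`). A rough standalone sketch of the same parity check; the checkpoint name is a placeholder, not the one the integration test actually loads:

```python
# Rough sketch of the slow/fast tokenizer parity check (the checkpoint name is
# a placeholder; substitute a real Llava checkpoint to actually run this).
from transformers import AutoTokenizer

checkpoint = "org/llava-checkpoint"  # placeholder, not from the PR
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
# Register the image placeholder as a special token on both tokenizers.
slow_tokenizer.add_tokens("<image>", True)
fast_tokenizer.add_tokens("<image>", True)

prompt = (
    "<|im_start|>system\nAnswer the questions.<|im_end|>"
    "<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>"
    "<|im_start|>assistant\n"
)
# With the fix, both tokenizers should yield the same tokens (no stray "▁"
# after added special tokens in the fast output).
assert slow_tokenizer.tokenize(prompt) == fast_tokenizer.tokenize(prompt)
```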