Unverified Commit 9a4a119c authored by Arthur, committed by GitHub

[`Llava`] + CIs: fix red CIs and Llava integration tests (#30440)



* nit

* nit and fmt skip

* fixup

* Update src/transformers/convert_slow_tokenizer.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* set to true

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 767e3518
@@ -46,7 +46,7 @@ def import_protobuf(error_message=""):
 def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
     if add_prefix_space:
         prepend_scheme = "always"
-        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
+        if not getattr(original_tokenizer, "legacy", True):
             prepend_scheme = "first"
     else:
         prepend_scheme = "never"
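
For context: the rewritten check in this hunk is behaviourally equivalent to the old `hasattr`-based one; a tokenizer without a `legacy` attribute still resolves to `"always"`. A minimal standalone sketch (not from the PR) covering the three relevant cases:

```python
# Minimal standalone sketch (not from the PR): the old hasattr-based check and
# the new getattr-based check pick the same prepend scheme whether `legacy` is
# missing, True, or False.
class FakeTokenizer:
    pass

def scheme_old(tok):
    scheme = "always"
    if hasattr(tok, "legacy") and not tok.legacy:
        scheme = "first"
    return scheme

def scheme_new(tok):
    scheme = "always"
    if not getattr(tok, "legacy", True):
        scheme = "first"
    return scheme

no_legacy = FakeTokenizer()                      # no `legacy` attribute
legacy_true = FakeTokenizer(); legacy_true.legacy = True
legacy_false = FakeTokenizer(); legacy_false.legacy = False

for tok in (no_legacy, legacy_true, legacy_false):
    assert scheme_old(tok) == scheme_new(tok)
print("old and new checks agree")
```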
@@ -1393,7 +1393,7 @@ class LlamaConverter(SpmConverter):
         return tokenizer

     def normalizer(self, proto):
-        if self.original_tokenizer.legacy:
+        if getattr(self.original_tokenizer, "legacy", True):
             sequence = []
             if getattr(self.original_tokenizer, "add_prefix_space"):
                 sequence += [normalizers.Prepend(prepend="▁")]
......
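
The `normalizer` hunk is where the default actually matters: plain attribute access on a tokenizer that never defines `legacy` raises `AttributeError`, while `getattr(..., "legacy", True)` falls back to the legacy normalizer path. A minimal sketch (not from the PR) of that difference:

```python
# Minimal sketch (not from the PR): plain attribute access fails for a
# tokenizer that never defines `legacy`; getattr with a default of True
# treats such a tokenizer as legacy instead of raising.
class FakeTokenizer:
    pass

tok = FakeTokenizer()

try:
    tok.legacy                                   # old code path: raises
except AttributeError:
    print("old access raises AttributeError")

assert getattr(tok, "legacy", True) is True      # new code path: defaults to legacy
print("new access defaults to legacy=True")
```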
@@ -591,14 +591,6 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
         fast_tokenizer.add_tokens("<image>", True)

         prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
-        # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens.
-        # https://github.com/huggingface/transformers/pull/28881 is the fix for this.
-        self.assertEqual(
-            slow_tokenizer.tokenize(prompt),
-            ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n']
-        ) # fmt: skip
-        self.assertEqual(
-            fast_tokenizer.tokenize(prompt),
-            ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '<image>', '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n']
-        ) # fmt: skip
+        EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip
+        self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
+        self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
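
The rewritten test asserts that the slow and fast Llava tokenizers now produce identical tokens for the chat prompt; before the fix the fast tokenizer emitted an extra `▁` after added special tokens (`'▁system'`, `'▁user'`, `'▁assistant'`). A rough standalone sketch of the same parity check; the checkpoint name is a placeholder, not the one the integration test actually loads:

```python
# Rough sketch of the slow/fast tokenizer parity check (the checkpoint name is
# a placeholder; substitute a real Llava checkpoint to actually run this).
from transformers import AutoTokenizer

checkpoint = "org/llava-checkpoint"  # placeholder, not from the PR
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
# Register the image placeholder as a special token on both tokenizers.
slow_tokenizer.add_tokens("<image>", True)
fast_tokenizer.add_tokens("<image>", True)

prompt = (
    "<|im_start|>system\nAnswer the questions.<|im_end|>"
    "<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>"
    "<|im_start|>assistant\n"
)
# With the fix, both tokenizers should yield the same tokens (no stray "▁"
# after added special tokens in the fast output).
assert slow_tokenizer.tokenize(prompt) == fast_tokenizer.tokenize(prompt)
```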