Unverified commit a2a7f716 authored by Arthur, committed by GitHub


[`TokenizationLlama`] fix the way we convert tokens to strings to keep leading spaces 🚨 breaking fix (#29453)

* nit

* update test and fix test

* fixup
parent e677479c
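
For context, here is a minimal sketch of the user-visible change. The token ids come from the new test below; the pre-fix output is inferred from the 🚨 breaking label and is an assumption, since the old behavior is not shown in this diff.

    from transformers import LlamaTokenizer

    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)

    ids = tokenizer.encode("hello", add_special_tokens=True)  # [1, 22172] per the new test
    print(tokenizer.decode(ids))
    # after this fix:  "<s> hello"  (the space between <s> and "hello" is preserved)
    # before this fix: presumably "<s>hello" (leading space dropped), hence the breaking label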
@@ -295,6 +295,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
                 prev_is_special = True
                 current_sub_tokens = []
             else:
+                if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
+                    out_string += " "
                 current_sub_tokens.append(token)
                 prev_is_special = False
         out_string += self.sp_model.decode(current_sub_tokens)
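
For readers without the full file open, here is a sketch of how the two added lines sit inside `convert_tokens_to_string`. Everything outside the hunk's context lines is reconstructed from those lines and is an assumption, not part of this diff.

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # sketch: surrounding code reconstructed around the hunk above
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # special tokens are decoded outside the sentencepiece model
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                # new: re-insert the space that decoding would otherwise drop right
                # after a leading special token ("<s> hello" no longer becomes "<s>hello")
                if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
                    out_string += " "
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string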
@@ -581,6 +581,19 @@ class LlamaIntegrationTest(unittest.TestCase):
         decoded_tokens = tokenizer.decode(input_ids)
         self.assertEqual(decoded_tokens, " <s> Hello<s> how")
+
+        # Let's make sure the space is preserved
+        input_ids = tokenizer.encode("hello", add_special_tokens=True)
+        self.assertEqual(input_ids, [1, 22172])
+        tokens = tokenizer.tokenize("hello")
+        self.assertEqual(tokens, ["▁hello"])
+        decoded_tokens = tokenizer.decode(input_ids)
+        self.assertEqual(decoded_tokens, "<s> hello")
+
+        input_ids = tokenizer.encode("hello", add_special_tokens=False)
+        self.assertEqual(input_ids, [22172])
+        decoded_tokens = tokenizer.decode(input_ids)
+        self.assertEqual(decoded_tokens, "hello")

     def test_some_edge_cases(self):
         tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
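
As a quick usage check mirroring the assertions above (`skip_special_tokens` is the standard `transformers` decode argument; its expected output here is an assumption based on the new behavior, not asserted in this test):

    from transformers import LlamaTokenizer

    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
    ids = tokenizer.encode("hello", add_special_tokens=True)  # [1, 22172]

    print(tokenizer.decode(ids))                              # "<s> hello" (space preserved)
    print(tokenizer.decode(ids, skip_special_tokens=True))    # expected: "hello" (<s> filtered before decoding)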