Unverified Commit 1025a9b7 authored by SaulLu, committed by GitHub

add a warning in `SpmConverter` for sentencepiece models using the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747fec35773d0b3d4788b8b15aefa0229.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8927b73627ec8f4f3eed53a95fc700d.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
parent 7c5d7991
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outside
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -429,6 +430,14 @@ class SpmConverter(Converter):
             m.ParseFromString(f.read())
         self.proto = m
 
+        if self.proto.trainer_spec.byte_fallback:
+            warnings.warn(
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
+                " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
+                "unknown tokens into a sequence of byte tokens matching the original piece of text."
+            )
+
     def vocab(self, proto):
         return [(piece.piece, piece.score) for piece in proto.pieces]
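The behaviour described in this warning can be observed from the slow side. Below is a minimal sketch, not part of the commit; it assumes `sentencepiece` is installed, and the model path and sample character are placeholders for a model trained with byte fallback enabled.

```python
# Minimal sketch (not part of the commit): what byte fallback does in the
# slow (sentencepiece) tokenizer. "spm_with_bytefallback.model" is a
# placeholder path to a model trained with byte_fallback enabled.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spm_with_bytefallback.model")

# A character missing from the vocabulary is decomposed into byte pieces
# such as "<0xE6>" instead of collapsing to a single <unk> token; a fast
# tokenizer converted from this model would emit <unk> here instead.
print(sp.encode("槊", out_type=str))
```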
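The new check reads the `byte_fallback` field of the parsed model's `trainer_spec`, which is why the commit also touches the vendored sentencepiece protobuf definitions ("update proto sentencepiece model" above). A rough sketch of inspecting that flag directly, assuming a `sentencepiece` release that ships its own protobuf bindings and using a placeholder model path:

```python
# Sketch only: read the flag that SpmConverter now checks.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path
    m.ParseFromString(f.read())

print(m.trainer_spec.byte_fallback)  # True for byte-fallback models
```

The commit also adds a new regression test that exercises both a fixture without and a fixture with byte fallback: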
import unittest
import warnings
from dataclasses import dataclass

from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir


@dataclass
class FakeOriginalTokenizer:
    vocab_file: str


class ConvertSlowTokenizerTest(unittest.TestCase):
    def test_spm_converter_bytefallback_warning(self):
        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"

        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)

        # No warning is expected for a model that does not use byte fallback.
        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_without_bytefallback)
        self.assertEqual(len(w), 0)

        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)

        # Exactly one warning is expected for a model trained with byte fallback.
        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_with_bytefallback)
        self.assertEqual(len(w), 1)

        self.assertIn(
            (
                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                " which is not implemented in the fast tokenizers."
            ),
            str(w[0].message),
        )
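The test relies on a new fixture, `test_sentencepiece_with_bytefallback.model`, i.e. a small model trained with byte fallback enabled. The commit does not show how it was generated; a hypothetical way to produce a similar fixture, assuming `sentencepiece` is installed and a small `corpus.txt` is available:

```python
# Hypothetical sketch: train a tiny sentencepiece model with byte fallback.
# Not the script used for the actual fixture; the corpus, vocab size and
# model type are placeholders.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="corpus.txt",  # one sentence per line
    model_prefix="test_sentencepiece_with_bytefallback",
    vocab_size=1000,
    model_type="bpe",
    byte_fallback=True,  # sets trainer_spec.byte_fallback in the model proto
)
```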