Revert error back into warning for byte fallback conversion. (#22607)

0aa1153f · Nicolas Patry · GitHub · 1670be4b · 0aa1153f · 0aa1153f
Unverified commit 0aa1153f, authored Apr 06, 2023 by Nicolas Patry; committed by GitHub Apr 06, 2023.
Showing 2 changed files with 6 additions and 3 deletions

src/transformers/convert_slow_tokenizer.py +2 -1

tests/utils/test_convert_slow_tokenizer.py +4 -2

No files found.
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """

+import warnings
 from typing import Dict, List, Tuple

 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -450,7 +451,7 @@ class SpmConverter(Converter):

        if self.proto.trainer_spec.byte_fallback:
            if not getattr(self, "handle_byte_fallback", None):
-                raise RuntimeError(
+                warnings.warn(
                    "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                    " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
                    " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "

--- a/tests/utils/test_convert_slow_tokenizer.py
+++ b/tests/utils/test_convert_slow_tokenizer.py
@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):

        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)

-        with self.assertRaises(RuntimeError) as cm:
+        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
+
        self.assertIn(
            "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
            " which is not implemented in the fast tokenizers.",
-            str(cm.exception),
+            str(w[0].message),
        )