Unverified Commit 0aa1153f authored by Nicolas Patry's avatar Nicolas Patry Committed by GitHub
Browse files

Revert error back into warning for byte fallback conversion. (#22607)

parent 1670be4b
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """
+import warnings
 from typing import Dict, List, Tuple
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -450,7 +451,7 @@ class SpmConverter(Converter):
         if self.proto.trainer_spec.byte_fallback:
             if not getattr(self, "handle_byte_fallback", None):
-                raise RuntimeError(
+                warnings.warn(
                     "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                     " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
                     " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
......
@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
         original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
-        with self.assertRaises(RuntimeError) as cm:
+        with warnings.catch_warnings(record=True) as w:
             _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
         self.assertIn(
             "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
             " which is not implemented in the fast tokenizers.",
-            str(cm.exception),
+            str(w[0].message),
         )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment