Unverified Commit 1025a9b7 authored by SaulLu, committed by GitHub

add a warning in `SpmConverter` for sentencepiece models using the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747fec35773d0b3d4788b8b15aefa0229.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8927b73627ec8f4f3eed53a95fc700d.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
parent 7c5d7991
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outside
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -429,6 +430,14 @@ class SpmConverter(Converter):
             m.ParseFromString(f.read())
         self.proto = m
 
+        if self.proto.trainer_spec.byte_fallback:
+            warnings.warn(
+                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
+                " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
+                " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
+                "unknown tokens into a sequence of byte tokens matching the original piece of text."
+            )
+
     def vocab(self, proto):
         return [(piece.piece, piece.score) for piece in proto.pieces]
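The behaviour described in this warning can be observed from the slow side. Below is a minimal sketch, not part of the commit; it assumes `sentencepiece` is installed, and the model path and sample character are placeholders for a model trained with byte fallback enabled.

```python
# Minimal sketch (not part of the commit): what byte fallback does in the
# slow (sentencepiece) tokenizer. "spm_with_bytefallback.model" is a
# placeholder path to a model trained with byte_fallback enabled.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spm_with_bytefallback.model")

# A character missing from the vocabulary is decomposed into byte pieces
# such as "<0xE6>" instead of collapsing to a single <unk> token; a fast
# tokenizer converted from this model would emit <unk> here instead.
print(sp.encode("槊", out_type=str))
```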
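The new check reads the `byte_fallback` field of the parsed model's `trainer_spec`, which is why the commit also touches the vendored sentencepiece protobuf definitions ("update proto sentencepiece model" above). A rough sketch of inspecting that flag directly, assuming a `sentencepiece` release that ships its own protobuf bindings and using a placeholder model path:

```python
# Sketch only: read the flag that SpmConverter now checks.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path
    m.ParseFromString(f.read())

print(m.trainer_spec.byte_fallback)  # True for byte-fallback models
```

The commit also adds a new regression test that exercises both a fixture without and a fixture with byte fallback: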
import unittest
import warnings
from dataclasses import dataclass

from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir


@dataclass
class FakeOriginalTokenizer:
    vocab_file: str


class ConvertSlowTokenizerTest(unittest.TestCase):
    def test_spm_converter_bytefallback_warning(self):
        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"

        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)

        # No warning is expected for a model that does not use byte fallback.
        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_without_bytefallback)
        self.assertEqual(len(w), 0)

        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)

        # Exactly one warning is expected for a model trained with byte fallback.
        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_with_bytefallback)
        self.assertEqual(len(w), 1)

        self.assertIn(
            (
                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                " which is not implemented in the fast tokenizers."
            ),
            str(w[0].message),
        )
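The test relies on a new fixture, `test_sentencepiece_with_bytefallback.model`, i.e. a small model trained with byte fallback enabled. The commit does not show how it was generated; a hypothetical way to produce a similar fixture, assuming `sentencepiece` is installed and a small `corpus.txt` is available:

```python
# Hypothetical sketch: train a tiny sentencepiece model with byte fallback.
# Not the script used for the actual fixture; the corpus, vocab size and
# model type are placeholders.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="corpus.txt",  # one sentence per line
    model_prefix="test_sentencepiece_with_bytefallback",
    vocab_size=1000,
    model_type="bpe",
    byte_fallback=True,  # sets trainer_spec.byte_fallback in the model proto
)
```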