Unverified Commit 476ba679, authored by SaulLu, committed by GitHub

Feature to use the PreTrainedTokenizerFast class as a stand-alone tokenizer (#11810)



* feature for tokenizer without slow/legacy version

* format

* modify common test

* add tests

* add PreTrainedTokenizerFast to AutoTokenizer

* format

* change tokenizer common test in order to be able to run tests without a slow version

* update tokenizer fast test in order to use `rust_tokenizer_class` attribute instead of `tokenizer_class`

* add AutoTokenizer test

* replace `if self.tokenizer_class is not None` with `if self.tokenizer_class is None`

* remove obsolete change in comment

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_fast.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* change `get_main_tokenizer` into `get_tokenizers`

* clarify `get_tokenizers` method

* homogenize with `test_slow_tokenizer` and `test_rust_tokenizer`

* add `test_rust_tokenizer = False` to tokenizers which don't define a fast version

* `test_rust_tokenizer = False` for BertJapaneseTokenizer

* `test_rust_tokenizer = False` for BertJapaneseCharacterTokenizationTest
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 4a51b1dd
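As a rough sketch of what this feature enables (illustrative only, not part of the diff; it uses the dummy Hub repos that the new tests below exercise): a tokenizer that only ships a tokenizer.json, with no model-specific slow class, can now be loaded and used on its own.

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load a fast tokenizer that has no slow/legacy counterpart, straight from its
# serialized tokenizer.json; "robot-test/dummy-tokenizer-fast" is the dummy
# repo used by the new tests in this PR.
tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
print(tokenizer("Hello world")["input_ids"])

# AutoTokenizer can also resolve to the stand-alone class, as the new
# test_PreTrainedTokenizerFast_from_pretrained test below verifies.
tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
assert type(tokenizer) is PreTrainedTokenizerFast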
@@ -157,6 +157,7 @@ else:
     Speech2TextTokenizer = None

 if is_tokenizers_available():
+    from ...tokenization_utils_fast import PreTrainedTokenizerFast
     from ..albert.tokenization_albert_fast import AlbertTokenizerFast
     from ..bart.tokenization_bart_fast import BartTokenizerFast
     from ..barthez.tokenization_barthez_fast import BarthezTokenizerFast
@@ -223,6 +224,7 @@ else:
     T5TokenizerFast = None
     XLMRobertaTokenizerFast = None
     XLNetTokenizerFast = None
+    PreTrainedTokenizerFast = None

 logger = logging.get_logger(__name__)
@@ -297,6 +299,7 @@ NO_CONFIG_TOKENIZER = [
     BarthezTokenizerFast,
     MBart50Tokenizer,
     MBart50TokenizerFast,
+    PreTrainedTokenizerFast,
 ]
...
@@ -1872,14 +1872,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
             legacy_format (:obj:`bool`, `optional`):
                 Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
-                format as well as in legacy format, i.e. with tokenizer specific vocabulary and a separate added_tokens
-                files.
+                format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
+                added_tokens files.

                 If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
                 with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
                 be loaded in the corresponding "slow" tokenizer.

-                If :obj:`True`, will save the tokenizer in legacy format.
+                If :obj:`True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a
+                value error is raised.
             filename_prefix: (:obj:`str`, `optional`):
                 A prefix to add to the names of the files saved by the tokenizer.
...
@@ -525,7 +525,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         save_directory = str(save_directory)

-        save_slow = legacy_format is None or legacy_format is True
+        if self.slow_tokenizer_class is None and legacy_format is True:
+            raise ValueError(
+                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You "
+                "might consider leaving the legacy_format at `None` or setting it to `False`."
+            )
+
+        save_slow = (legacy_format is None or legacy_format is True) and self.slow_tokenizer_class is not None
         save_fast = legacy_format is None or legacy_format is False

         if save_slow:
...
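In effect (a hedged sketch of the new save behavior, reusing the dummy repo from this PR's tests): for a stand-alone fast tokenizer, the legacy files are quietly skipped when legacy_format is left at None, and explicitly requesting them raises the new ValueError.

from transformers import PreTrainedTokenizerFast

# A stand-alone fast tokenizer: its slow_tokenizer_class is None.
tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")

tokenizer.save_pretrained("saved")                       # default: legacy files silently skipped
tokenizer.save_pretrained("saved", legacy_format=False)  # unified tokenizer.json format only

try:
    tokenizer.save_pretrained("saved", legacy_format=True)
except ValueError as err:  # raised by the check added above
    print(err)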
@@ -24,6 +24,7 @@ from transformers import (
     BertTokenizerFast,
     GPT2Tokenizer,
     GPT2TokenizerFast,
+    PreTrainedTokenizerFast,
     RobertaTokenizer,
     RobertaTokenizerFast,
 )
@@ -119,3 +120,12 @@ class AutoTokenizerTest(unittest.TestCase):
         tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False)
         tokens = tokenizer.tokenize(sample)
         self.assertEqual("[UNK]", tokens[0])
+
+    @require_tokenizers
+    def test_PreTrainedTokenizerFast_from_pretrained(self):
+        tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
+        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)
+        self.assertEqual(tokenizer.model_max_length, 512)
+        self.assertEqual(tokenizer.vocab_size, 30000)
+        self.assertEqual(tokenizer.unk_token, "[UNK]")
+        self.assertEqual(tokenizer.padding_side, "right")
@@ -32,6 +32,7 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
 class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertGenerationTokenizer
+    test_rust_tokenizer = False
     test_sentencepiece = True

     def setUp(self):
...
@@ -35,6 +35,7 @@ from .test_tokenization_common import TokenizerTesterMixin
 class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertJapaneseTokenizer
+    test_rust_tokenizer = False
     space_between_special_tokens = True

     def setUp(self):
@@ -204,6 +205,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertJapaneseTokenizer
+    test_rust_tokenizer = False

     def setUp(self):
         super().setUp()
...
@@ -24,6 +24,7 @@ from .test_tokenization_common import TokenizerTesterMixin
 class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertweetTokenizer
+    test_rust_tokenizer = False

     def setUp(self):
         super().setUp()
...
@@ -30,6 +30,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = CLIPTokenizer
     rust_tokenizer_class = CLIPTokenizerFast
+    test_rust_tokenizer = False
     from_pretrained_kwargs = {"add_prefix_space": True}
     test_seq2seq = False
...
[This diff is collapsed and not shown here.]
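The collapsed diff is the change to the common tester (test_tokenization_common.py, which the test hunks here import TokenizerTesterMixin from). As a rough sketch of the `get_tokenizers` helper the commit message mentions (assumed shape, since the actual diff is not shown here), it honors the class-level `test_slow_tokenizer` / `test_rust_tokenizer` flags that the hunks above and below set:

def get_tokenizers(self, fast=True, **kwargs):
    # Sketch (assumed shape): return every tokenizer variant a test should run
    # against. Tokenizers without a fast (Rust) version set
    # test_rust_tokenizer = False; stand-alone fast tokenizers set
    # test_slow_tokenizer = False.
    if fast and self.test_rust_tokenizer and self.test_slow_tokenizer:
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
    elif fast and self.test_rust_tokenizer:
        return [self.get_rust_tokenizer(**kwargs)]
    elif self.test_slow_tokenizer:
        return [self.get_tokenizer(**kwargs)]
    else:
        raise ValueError("This tokenizer class has no tokenizer to be tested.")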
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import PreTrainedTokenizerFast
from transformers.testing_utils import require_tokenizers

from .test_tokenization_common import TokenizerTesterMixin


@require_tokenizers
class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
    rust_tokenizer_class = PreTrainedTokenizerFast
    test_slow_tokenizer = False
    test_rust_tokenizer = True
    from_pretrained_vocab_key = "tokenizer_file"

    def setUp(self):
        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
        super().setUp()
        self.test_rust_tokenizer = True

        self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]

        tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
        tokenizer.save_pretrained(self.tmpdirname)

    def test_pretrained_model_lists(self):
        # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
        # model
        pass

    def test_prepare_for_model(self):
        # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
        # model
        pass

    def test_rust_tokenizer_signature(self):
        # PreTrainedTokenizerFast doesn't have tokenizer_file in its signature
        pass
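For context, a sketch of how such a stand-alone fast tokenizer comes to exist in the first place (an assumption-laden illustration with the `tokenizers` library, not part of this diff): train a tokenizer, save its tokenizer.json, and wrap it in PreTrainedTokenizerFast.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

from transformers import PreTrainedTokenizerFast

# Train a small BPE tokenizer with the `tokenizers` library; it has no slow
# (pure-Python) counterpart in transformers.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]"], vocab_size=1000)
tokenizer.train_from_iterator(["hello world", "hello tokenizers"], trainer=trainer)
tokenizer.save("tokenizer.json")

# Wrap it so the usual transformers API (padding, saving, reloading) works
# without any legacy vocabulary files.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
)
fast_tokenizer.save_pretrained("my-standalone-tokenizer", legacy_format=False)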
@@ -31,6 +31,7 @@ FSMT_TINY2 = "stas/tiny-wmt19-en-ru"
 class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = FSMTTokenizer
+    test_rust_tokenizer = False

     def setUp(self):
         super().setUp()
...
@@ -24,6 +24,7 @@ from .test_tokenization_common import TokenizerTesterMixin
 class Luke(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = LukeTokenizer
+    test_rust_tokenizer = False
     from_pretrained_kwargs = {"cls_token": "<s>"}

     def setUp(self):
...
@@ -24,6 +24,7 @@ from .test_tokenization_common import TokenizerTesterMixin
 class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = PhobertTokenizer
+    test_rust_tokenizer = False

     def setUp(self):
         super().setUp()
...
@@ -29,6 +29,7 @@ from .test_tokenization_common import TokenizerTesterMixin
 class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BlenderbotSmallTokenizer
+    test_rust_tokenizer = False

     def setUp(self):
         super().setUp()
...