Unverified Commit ba8c4d0a authored by Thomas Wolf, committed by GitHub

[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece-based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉



* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up and lighten up tests when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style quality, split herbert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c65863ce
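
The decorators require_sentencepiece and require_tokenizers from transformers.testing_utils appear in nearly every hunk below; their implementation is not part of this diff. As a minimal sketch of the availability-gated skip pattern they rely on (the helper _is_package_available and the module-level flags are illustrative assumptions, not the library's actual code), such decorators could look like:

# Minimal sketch of availability-gated test skips (not the library's exact code).
# Only the decorator names require_sentencepiece / require_tokenizers are taken
# from the diff below; _is_package_available is an illustrative helper.
import importlib.util
import unittest

def _is_package_available(name: str) -> bool:
    # True when the optional package can be imported in the current environment.
    return importlib.util.find_spec(name) is not None

_sentencepiece_available = _is_package_available("sentencepiece")
_tokenizers_available = _is_package_available("tokenizers")

def require_sentencepiece(test_case):
    # Skip the decorated test (or test class) when sentencepiece is not installed.
    return unittest.skipUnless(_sentencepiece_available, "test requires SentencePiece")(test_case)

def require_tokenizers(test_case):
    # Skip the decorated test (or test class) when the Rust tokenizers package is not installed.
    return unittest.skipUnless(_tokenizers_available, "test requires tokenizers")(test_case)

Stacking both decorators on a test class or method, as the hunks below do, skips the test whenever either optional dependency is missing from the environment.
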
@@ -20,7 +20,7 @@ import unittest
from transformers import is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow, torch_device
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, ids_tensor
@@ -29,9 +29,8 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
if is_torch_available():
import torch
from transformers import T5Config, T5ForConditionalGeneration, T5Model
from transformers import T5Config, T5ForConditionalGeneration, T5Model, T5Tokenizer
from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.tokenization_t5 import T5Tokenizer
class T5ModelTester:
@@ -546,6 +545,8 @@ def use_task_specific_params(model, task):
@require_torch
@require_sentencepiece
@require_tokenizers
class T5ModelIntegrationTests(unittest.TestCase):
@cached_property
def model(self):
......
@@ -16,7 +16,7 @@
import unittest
from transformers import is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
if is_tf_available():
@@ -27,6 +27,8 @@ if is_tf_available():
@require_tf
@require_sentencepiece
@require_tokenizers
class TFCamembertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
......
@@ -16,7 +16,7 @@
import unittest
from transformers import is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
@@ -332,6 +332,8 @@ class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase):
@require_tf
@require_sentencepiece
@require_tokenizers
class TFFlaubertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
......
@@ -17,7 +17,7 @@
import unittest
from transformers import is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
@@ -304,6 +304,8 @@ class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase):
@require_tf
@require_sentencepiece
@require_tokenizers
class TFLongformerModelIntegrationTest(unittest.TestCase):
def _get_hidden_states(self):
return tf.convert_to_tensor(
......
@@ -17,7 +17,7 @@
import unittest
from transformers import RobertaConfig, is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
@@ -222,6 +222,8 @@ class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):
@require_tf
@require_sentencepiece
@require_tokenizers
class TFRobertaModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_masked_lm(self):
......
@@ -18,7 +18,7 @@ import unittest
from transformers import T5Config, is_tf_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
@@ -285,6 +285,8 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
@require_tf
@require_sentencepiece
@require_tokenizers
class TFT5ModelIntegrationTests(unittest.TestCase):
@cached_property
def model(self):
......
@@ -16,7 +16,7 @@
import unittest
from transformers import is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
if is_tf_available():
@@ -27,6 +27,8 @@ if is_tf_available():
@require_tf
@require_sentencepiece
@require_tokenizers
class TFFlaubertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
......
@@ -17,7 +17,7 @@
import unittest
from transformers import is_torch_available
from transformers.testing_utils import slow
from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
if is_torch_available():
@@ -26,6 +26,8 @@ if is_torch_available():
from transformers import XLMRobertaModel
@require_sentencepiece
@require_tokenizers
class XLMRobertaModelIntegrationTest(unittest.TestCase):
@slow
def test_xlm_roberta_base(self):
......
@@ -10,7 +10,7 @@ from transformers.convert_graph_to_onnx import (
infer_shapes,
quantize,
)
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow
class FuncContiguousArgs:
@@ -94,6 +94,7 @@ class OnnxExportTestCase(unittest.TestCase):
self.fail(e)
@require_torch
@require_tokenizers
def test_infer_dynamic_axis_pytorch(self):
"""
Validate the dynamic axis generated for each parameters are correct
@@ -105,6 +106,7 @@ class OnnxExportTestCase(unittest.TestCase):
self._test_infer_dynamic_axis(model, tokenizer, "pt")
@require_tf
@require_tokenizers
def test_infer_dynamic_axis_tf(self):
"""
Validate the dynamic axis generated for each parameters are correct
......
@@ -3,7 +3,7 @@ from typing import Iterable, List, Optional
from transformers import pipeline
from transformers.pipelines import SUPPORTED_TASKS, Conversation, DefaultArgumentHandler, Pipeline
from transformers.testing_utils import require_tf, require_torch, slow, torch_device
from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow, torch_device
DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0
@@ -342,6 +342,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
)
@require_torch
@require_tokenizers
def test_torch_summarization(self):
invalid_inputs = [4, "<mask>"]
mandatory_keys = ["summary_text"]
@@ -377,6 +378,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
)
@require_torch
@require_tokenizers
def test_torch_translation(self):
invalid_inputs = [4, "<mask>"]
mandatory_keys = ["translation_text"]
@@ -399,6 +401,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs)
@require_torch
@require_tokenizers
def test_torch_text2text(self):
invalid_inputs = [4, "<mask>"]
mandatory_keys = ["generated_text"]
......
@@ -14,7 +14,13 @@ from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.configuration_rag import RagConfig
from transformers.retrieval_rag import RagRetriever
from transformers.testing_utils import require_datasets, require_faiss, require_torch
from transformers.testing_utils import (
require_datasets,
require_faiss,
require_sentencepiece,
require_tokenizers,
require_torch,
)
from transformers.tokenization_bart import BartTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
@@ -189,6 +195,8 @@ class RagRetrieverTest(TestCase):
self.assertListEqual(doc_ids.tolist(), [[1], [0]])
@require_torch
@require_tokenizers
@require_sentencepiece
def test_hf_index_retriever_call(self):
import torch
......
@@ -17,7 +17,8 @@
import os
import unittest
from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from transformers import AlbertTokenizer, AlbertTokenizerFast
from transformers.testing_utils import require_sentencepiece, require_tokenizers
from .test_tokenization_common import TokenizerTesterMixin
@@ -25,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
@require_sentencepiece
@require_tokenizers
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer
......
@@ -33,6 +33,7 @@ from transformers.testing_utils import (
DUMMY_DIFF_TOKENIZER_IDENTIFIER,
DUMMY_UNKWOWN_IDENTIFIER,
SMALL_MODEL_IDENTIFIER,
require_tokenizers,
)
from transformers.tokenization_auto import TOKENIZER_MAPPING
@@ -70,6 +71,7 @@ class AutoTokenizerTest(unittest.TestCase):
self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
self.assertEqual(tokenizer.vocab_size, 12)
@require_tokenizers
def test_tokenizer_identifier_with_correct_config(self):
for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
@@ -82,6 +84,7 @@ class AutoTokenizerTest(unittest.TestCase):
self.assertEqual(tokenizer.max_len, 512)
@require_tokenizers
def test_tokenizer_identifier_non_existent(self):
for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
with self.assertRaises(EnvironmentError):
@@ -101,12 +104,16 @@ class AutoTokenizerTest(unittest.TestCase):
msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
):
self.assertFalse(issubclass(child_config, parent_config))
self.assertFalse(issubclass(child_model_py, parent_model_py))
# Check for Slow tokenizer implementation if provided
if child_model_py and parent_model_py:
self.assertFalse(issubclass(child_model_py, parent_model_py))
# Check for Fast tokenizer implementation if provided
if child_model_fast and parent_model_fast:
self.assertFalse(issubclass(child_model_fast, parent_model_fast))
@require_tokenizers
def test_from_pretrained_use_fast_toggle(self):
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer)
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast)
@@ -4,16 +4,19 @@ import unittest
from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch
from transformers.testing_utils import require_tokenizers, require_torch
from transformers.tokenization_roberta import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
from .test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
@require_tokenizers
class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BartTokenizer
rust_tokenizer_class = BartTokenizerFast
test_rust_tokenizer = True
from_pretrained_filter = filter_roberta_detectors
# from_pretrained_kwargs = {'add_prefix_space': True}
def setUp(self):
super().setUp()
@@ -56,7 +59,7 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BartTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer"
@@ -145,3 +148,38 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item())
def test_pretokenized_inputs(self):
pass
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
# token_type_ids should put 0 everywhere
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
# attention_mask should put 1 everywhere, so sum over length should be 1
self.assertEqual(
sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
)
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
# Rust correctly handles the space before the mask while python doesnt
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(
tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
@@ -17,27 +17,29 @@
import os
import unittest
from transformers.testing_utils import slow
from transformers import BertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_bert import (
VOCAB_FILES_NAMES,
BasicTokenizer,
BertTokenizer,
BertTokenizerFast,
WordpieceTokenizer,
_is_control,
_is_punctuation,
_is_whitespace,
)
from .test_tokenization_common import TokenizerTesterMixin
from .test_tokenization_common import TokenizerTesterMixin, filter_non_english
@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertTokenizer
rust_tokenizer_class = BertTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
def setUp(self):
super().setUp()
@@ -245,3 +247,55 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
assert encoded_sentence == [101] + text + [102]
assert encoded_pair == [101] + text + [102] + text_2 + [102]
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
sentence,
return_attention_mask=False,
return_token_type_ids=False,
return_offsets_mapping=True,
add_special_tokens=True,
)
do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
expected_results = (
[
((0, 0), tokenizer_r.cls_token),
((0, 1), "A"),
((1, 2), ","),
((3, 5), "na"),
((5, 6), "##ï"),
((6, 8), "##ve"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "Allen"),
((21, 23), "##NL"),
((23, 24), "##P"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
if not do_lower_case
else [
((0, 0), tokenizer_r.cls_token),
((0, 1), "a"),
((1, 2), ","),
((3, 8), "naive"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "allen"),
((21, 23), "##nl"),
((23, 24), "##p"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
)
self.assertEqual(
[e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
@@ -17,9 +17,9 @@
import os
import unittest
from transformers import BertGenerationTokenizer
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow
from transformers.testing_utils import require_sentencepiece, require_torch, slow
from transformers.tokenization_bert_generation import BertGenerationTokenizer
from .test_tokenization_common import TokenizerTesterMixin
@@ -29,6 +29,7 @@ SPIECE_UNDERLINE = "▁"
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertGenerationTokenizer
......
@@ -19,12 +19,12 @@ import pickle
import unittest
from transformers.testing_utils import custom_tokenizers
from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (
VOCAB_FILES_NAMES,
BertJapaneseTokenizer,
CharacterTokenizer,
MecabTokenizer,
WordpieceTokenizer,
)
from .test_tokenization_common import TokenizerTesterMixin
......
@@ -17,8 +17,8 @@
import os
import unittest
from transformers.testing_utils import _torch_available
from transformers import CamembertTokenizer, CamembertTokenizerFast
from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from transformers.testing_utils import _torch_available, require_sentencepiece, require_tokenizers
from .test_tokenization_common import TokenizerTesterMixin
@@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
FRAMEWORK = "pt" if _torch_available else "tf"
@require_sentencepiece
@require_tokenizers
class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CamembertTokenizer
......
@@ -14,16 +14,18 @@
# limitations under the License.
import inspect
import os
import pickle
import re
import shutil
import tempfile
from collections import OrderedDict
from itertools import takewhile
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast, is_torch_available
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.testing_utils import get_tests_dir, require_tf, require_tokenizers, require_torch, slow
from transformers.tokenization_utils import AddedToken
@@ -31,6 +33,18 @@ if TYPE_CHECKING:
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
def filter_non_english(_, pretrained_name: str):
""" Filter all the model for non-english language """
return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS])
def filter_roberta_detectors(_, pretrained_name: str):
return "detector" not in pretrained_name
def merge_model_tokenizer_mappings(
model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]],
@@ -59,8 +73,32 @@ class TokenizerTesterMixin:
rust_tokenizer_class = None
test_rust_tokenizer = False
space_between_special_tokens = False
from_pretrained_kwargs = None
from_pretrained_filter = None
from_pretrained_vocab_key = "vocab_file"
def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
if self.test_rust_tokenizer:
tokenizers_list = [
(
self.rust_tokenizer_class,
pretrained_name,
self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
)
for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[
self.from_pretrained_vocab_key
].keys()
if self.from_pretrained_filter is None
or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name))
]
self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed
else:
self.tokenizers_list = []
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
def tearDown(self):
@@ -123,6 +161,15 @@ class TokenizerTesterMixin:
for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
def test_rust_tokenizer_signature(self):
if not self.test_rust_tokenizer:
return
signature = inspect.signature(self.rust_tokenizer_class.__init__)
self.assertIn("tokenizer_file", signature.parameters)
self.assertIsNone(signature.parameters["tokenizer_file"].default)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
@@ -206,7 +253,6 @@ class TokenizerTesterMixin:
shutil.rmtree(tmpdirname)
# Now let's start the test
tokenizers = self.get_tokenizers(model_max_length=42)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -237,6 +283,39 @@ class TokenizerTesterMixin:
shutil.rmtree(tmpdirname)
# Test that we can also use the non-legacy saving format for fast tokenizers
tokenizers = self.get_tokenizers(model_max_length=42)
for tokenizer in tokenizers:
if not tokenizer.is_fast:
continue
with self.subTest(f"{tokenizer.__class__.__name__}"):
# Isolate this from the other tests because we save additional tokens/etc
tmpdirname = tempfile.mkdtemp()
sample_text = " He is very happy, UNwant\u00E9d,running"
tokenizer.add_tokens(["bim", "bambam"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("new_additional_special_token")
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
after_vocab = after_tokenizer.get_vocab()
self.assertListEqual(before_tokens, after_tokens)
self.assertDictEqual(before_vocab, after_vocab)
self.assertIn("bim", after_vocab)
self.assertIn("bambam", after_vocab)
self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
self.assertEqual(after_tokenizer.model_max_length, 42)
tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
self.assertEqual(tokenizer.model_max_length, 43)
shutil.rmtree(tmpdirname)
def test_pickle_tokenizer(self):
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
tokenizers = self.get_tokenizers()
@@ -258,6 +337,7 @@ class TokenizerTesterMixin:
self.assertListEqual(subwords, subwords_loaded)
@require_tokenizers
def test_pickle_added_tokens(self):
tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
tok2 = pickle.loads(pickle.dumps(tok1))
@@ -419,6 +499,7 @@ class TokenizerTesterMixin:
self.assertEqual(text_2, output_text)
@require_tokenizers
def test_encode_decode_with_spaces(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
@@ -437,6 +518,15 @@ class TokenizerTesterMixin:
self.assertIn(decoded, [output, output.lower()])
def test_pretrained_model_lists(self):
# We should have at least one default checkpoint for each tokenizer
# We should specify the max input length as well (used in some part to list the pretrained checkpoints)
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
self.assertEqual(
len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]),
len(self.tokenizer_class.max_model_input_sizes),
)
weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
weights_lists_2 = []
for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
@@ -1226,6 +1316,7 @@ class TokenizerTesterMixin:
encoded_sequences_batch_padded_2[key],
)
@require_tokenizers
def test_added_token_serializable(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
@@ -1652,3 +1743,772 @@ class TokenizerTesterMixin:
self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
self.assertNotIn("decoder_input_ids", batch_encoder_only)
def test_is_fast(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Check is_fast is set correctly
self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast)
def test_fast_only_inputs(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Ensure None raise an error
self.assertRaises(TypeError, tokenizer_r.tokenize, None)
self.assertRaises(TypeError, tokenizer_r.encode, None)
self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)
def test_alignement_methods(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words)
batch_size = 3
encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)
batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
num_tokens = len(encoding["input_ids"])
last_word_index = len(words) - 1
last_token_index = num_tokens - 1
last_batch_index = batch_size - 1
last_char_index = len(text) - 1
# words, tokens
self.assertEqual(len(encoding.words(0)), num_tokens)
self.assertEqual(max(encoding.words(0)), last_word_index)
self.assertEqual(min(encoding.words(0)), 0)
self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
self.assertEqual(len(encoding.tokens(0)), num_tokens)
# Assert token_to_word
self.assertEqual(encoding.token_to_word(0), 0)
self.assertEqual(encoding.token_to_word(0, 0), 0)
self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)
# Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0)
self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(
batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
)
# Assert token_to_chars
self.assertEqual(encoding.token_to_chars(0).start, 0)
self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
)
# Assert char_to_token
self.assertEqual(encoding.char_to_token(0), 0)
self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)
# Assert char_to_word
self.assertEqual(encoding.char_to_word(0), 0)
self.assertEqual(encoding.char_to_word(0, 0), 0)
self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)
# Assert word_to_chars
self.assertEqual(encoding.word_to_chars(0).start, 0)
self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
)
def test_tokenization_python_rust_equals(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data)
input_r = tokenizer_r.encode_plus(self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
# Ensure truncation match
input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure truncation with stride match
input_p = tokenizer_p.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
input_r = tokenizer_r.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key][0])
def test_num_special_tokens_to_add_equal(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
)
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
)
def test_max_length_equal(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)
def test_special_tokens_map_equal(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Assert the set of special tokens match.
self.assertSequenceEqual(
tokenizer_p.special_tokens_map.items(),
tokenizer_r.special_tokens_map.items(),
)
def test_add_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertEqual(len(tokenizer_r), vocab_size + 3)
self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
self.assertRaises(
AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
)
self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
self.assertEqual(
tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
)
self.assertEqual(len(tokenizer_r), vocab_size + 8)
def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
# No pair
tokens_with_offsets = tokenizer_r.encode_plus(
text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Pairs
tokens_with_offsets = tokenizer_r.encode_plus(
text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
def test_batch_encode_dynamic_overflowing(self):
"""
When calling batch_encode with multiple sequence it can returns different number of
overflowing encoding for each sequence:
[
Sequence 1: [Encoding 1, Encoding 2],
Sequence 2: [Encoding 1],
Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
]
This needs to be padded so that it can represented as a tensor
"""
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
with self.subTest(
"{} ({}, {})".format(tokenizer.__class__.__name__, pretrained_name, tokenizer.__class__.__name__)
):
returned_tensor = "pt" if is_torch_available() else "tf"
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
max_length=6,
padding=True,
truncation=True,
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
# Mono sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
# Multi sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
def test_compare_pretokenized_inputs(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
continue # Too hard to test for now
# Input string
pretokenized_input_simple = "This is a sample input".split()
pretokenized_input_pair = "This is a sample pair".split()
# Test encode for pretokenized inputs
output_r = tokenizer_r.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
self.assertEqual(output_p, output_r)
kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
batch_kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test encode for pretokenized inputs pairs
output_r = tokenizer_r.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
self.assertEqual(output_p, output_r)
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
pretokenized_input_simple + pretokenized_input_pair,
pretokenized_input_pair,
]
output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
def test_create_token_type_ids(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
input_simple = [1, 2, 3]
input_pair = [1, 2, 3]
# Generate output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_build_inputs_with_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# # Input string
# input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
# input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
# # Generate output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
# self.assertEqual(output_p, output_r)
# # Generate pair output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
# self.assertEqual(output_p, output_r)
# Input token ids
input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
# Generate output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_padding(self, max_length=50):
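# Compare padding between the slow and fast tokenizers across encode, encode_plus, batch_encode_plus and pad,
# covering the legacy pad_to_max_length flag as well as the padding="max_length" / "longest" / True strategies.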
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
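# Helper: a single padded encoding from each tokenizer must have length max_length and end with the same run of pad tokens.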
# Ensure we match max_length
self.assertEqual(len(input_r), max_length)
self.assertEqual(len(input_p), max_length)
# Ensure the number of padded tokens is the same
padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)
def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
# Each returned field should contain two sequences, each padded to max_length
for i_r in input_r.values():
self.assertEqual(len(i_r), 2)
self.assertEqual(len(i_r[0]), max_length)
self.assertEqual(len(i_r[1]), max_length)
for i_p in input_p.values():
self.assertEqual(len(i_p), 2)
self.assertEqual(len(i_p[0]), max_length)
self.assertEqual(len(i_p[1]), max_length)
for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
assert_padded_input_match(i_r, i_p, max_length)
for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
self.assertSequenceEqual(i_r, i_p)
# Encode - Simple input
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", padding="longest")
input_p = tokenizer_p.encode("This is a simple input", padding=True)
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode - Pair input
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode_plus - Simple input
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Encode_plus - Pair input
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Batch_encode_plus - Simple input
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="longest",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding=True,
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding="longest"
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding=True
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Batch_encode_plus - Pair input
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding=True,
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding="longest",
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus("This is an input 1")
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_p.encode_plus("This is an input 1")
input_p = tokenizer_p.pad(input_p)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
# Using pad on single examples after tokenization, with max_length
input_r = tokenizer_r.encode_plus("This is an input 1")
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus("This is an input 1")
input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
# Using pad on a batch after tokenization
input_r = tokenizer_r.batch_encode_plus(
["This is an input 1", "This is a much longer input which should be padded"]
)
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_p.batch_encode_plus(
["This is an input 1", "This is a much longer input which should be padded"]
)
input_p = tokenizer_p.pad(input_p)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad on a batch after tokenization, with max_length
input_r = tokenizer_r.batch_encode_plus(
["This is an input 1", "This is a much longer input which should be padded"]
)
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_p.batch_encode_plus(
["This is an input 1", "This is a much longer input which should be padded"]
)
input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")
assert_batch_padded_input_match(input_r, input_p, max_length)
def test_save_pretrained(self):
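# Save both tokenizers to the same directory, check they report the same saved files,
# then reload them and check the fast tokenizer exposes every special token of the slow one.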
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Check both tokenizers save the same set of files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
shutil.rmtree(tmpdirname2)
def test_embeded_special_tokens(self):
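# Encode a sentence containing a special token (<mask>) and check both tokenizers agree on every output field
# and on the tokens recovered with convert_ids_to_tokens.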
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
sentence,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_compare_add_special_tokens(self):
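# Check that add_special_tokens=True adds exactly num_special_tokens_to_add(pair=False) tokens for the fast
# tokenizer, across tokenize, encode, encode_plus and batch_encode_plus.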
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
for text in ["", " "]:
# tokenize()
no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode()
no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# batch_encode_plus()
no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
def test_compare_prepare_for_model(self):
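# Check prepare_for_model returns the same fields for both tokenizers when given ids encoded without special tokens.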
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
)
rust_output = tokenizer_r.prepare_for_model(
tokenizer_r.encode(string_sequence, add_special_tokens=False)
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
...@@ -14,12 +14,13 @@
 # limitations under the License.

-from transformers.testing_utils import slow
-from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
+from transformers import DistilBertTokenizer, DistilBertTokenizerFast
+from transformers.testing_utils import require_tokenizers, slow

 from .test_tokenization_bert import BertTokenizationTest


+@require_tokenizers
 class DistilBertTokenizationTest(BertTokenizationTest):

     tokenizer_class = DistilBertTokenizer
...