"docs/source/tokenizer_summary.mdx" did not exist on "0f4e39c5599523c110cd713f60a3bfa145dad807"
Unverified Commit 7223844d authored by SaulLu's avatar SaulLu Committed by GitHub
Browse files

Change how "additional_special_tokens" argument in the ".from_pretrained"...

Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056)

* add test

* add change in PretrainedTokenizerBase

* change Luke

* deactivate

* add the possibility to add additional special tokens for M2M100

* format

* add special test for canine

* proposed changes for mbart

* proposed changes for mbart50

* proposed changes for byt5

* proposed changes for canine

* proposed changes for t5

* test fast and slow

* remove comment

* remove comment

* add fast version for all tests

* replace break by continue

* add more comments

* add check to avoid duplicates

* remove comment

* format

* proposed change for wave2vec2

* reverse changes mbart

* uncomment

* format
parent b13c6c18
......@@ -210,8 +210,8 @@ class LukeTokenizer(RobertaTokenizer):
if isinstance(entity_token_2, str)
else entity_token_2
)
kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2]
kwargs["additional_special_tokens"] += kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
super().__init__(
vocab_file=vocab_file,
......
......@@ -137,6 +137,15 @@ class M2M100Tokenizer(PreTrainedTokenizer):
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [
self.get_lang_token(lang_code)
for lang_code in FAIRSEQ_LANGUAGE_CODES
if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
]
super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
......@@ -157,14 +166,11 @@ class M2M100Tokenizer(PreTrainedTokenizer):
self.encoder_size = len(self.encoder)
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
self.lang_token_to_id = {
self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)}
self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
self._additional_special_tokens = list(self.lang_token_to_id.keys())
self._src_lang = src_lang if src_lang is not None else "en"
self.tgt_lang = tgt_lang
......
......@@ -130,6 +130,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
......@@ -168,7 +173,6 @@ class MBart50Tokenizer(PreTrainedTokenizer):
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
self._additional_special_tokens = list(self.lang_code_to_id.keys())
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
......
......@@ -125,6 +125,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
super().__init__(
vocab_file,
src_lang=src_lang,
......@@ -141,7 +146,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
self.vocab_file = vocab_file
self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES})
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
......
......@@ -1862,6 +1862,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key in kwargs and kwargs[key]:
# This value has already been redefined by the kwargs
# We keep this new value and ignore the one stored in the special_tokens_map_file
continue
if isinstance(value, dict):
value = AddedToken(**value)
elif isinstance(value, list):
......
......@@ -53,8 +53,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
def get_tokenizer(self, **kwargs):
kwargs.update(self.add_kwargs_tokens_map)
def get_tokenizer(self, **kwargs_init):
kwargs = self.add_kwargs_tokens_map.copy()
kwargs.update(kwargs_init)
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
......
......@@ -13,11 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from transformers import BatchEncoding, ByT5Tokenizer
from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
from transformers.file_utils import cached_property, is_tf_available, is_torch_available
from .test_tokenization_common import TokenizerTesterMixin
......@@ -161,6 +163,72 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
shutil.rmtree(tmpdirname)
# There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens
# We need to add the extra_ids in the list of the arg additional_special_tokens
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
if self.test_rust_tokenizer:
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
special_tokens_map = json.load(json_file)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)]
special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
"an_additional_special_token"
]
tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
"an_additional_special_token"
]
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
json.dump(special_tokens_map, outfile)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
tmp_dir,
)
self.assertIn(
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
)
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
["an_additional_special_token"],
tokenizer_without_change_in_init.convert_ids_to_tokens(
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
),
)
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
tokenizer = tokenizer_class.from_pretrained(
tmp_dir,
additional_special_tokens=new_added_tokens,
)
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
self.assertEqual(
["a_new_additional_special_token"],
tokenizer.convert_ids_to_tokens(
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
),
)
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
def test_pretrained_model_lists(self):
pass
......
......@@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
......@@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.save_pretrained(tmp_dir_name)
tokenizer.from_pretrained(tmp_dir_name)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
if self.test_rust_tokenizer:
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
special_tokens_map = json.load(json_file)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
# a special token for Canine can be defined as follows:
NEW_TOKEN = 0xE006
new_token_1 = chr(NEW_TOKEN)
special_tokens_map["additional_special_tokens"] = [new_token_1]
tokenizer_config["additional_special_tokens"] = [new_token_1]
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
json.dump(special_tokens_map, outfile)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
[new_token_1],
tokenizer_without_change_in_init.convert_ids_to_tokens(
tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
),
)
NEW_TOKEN = 0xE007
new_token_2 = chr(NEW_TOKEN)
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
tokenizer = tokenizer_class.from_pretrained(
tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
)
self.assertIn(new_token_2, tokenizer.additional_special_tokens)
# self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
[new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
)
@require_tokenizers
def test_encode_decode_with_spaces(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
......
......@@ -16,6 +16,7 @@
import inspect
import itertools
import json
import os
import pickle
import re
......@@ -3161,6 +3162,64 @@ class TokenizerTesterMixin:
self.assertTrue(special_token_id in p_output)
self.assertTrue(special_token_id in cr_output)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
if self.test_rust_tokenizer:
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
special_tokens_map = json.load(json_file)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
json.dump(special_tokens_map, outfile)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
tmp_dir,
)
self.assertIn(
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
)
self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())
self.assertEqual(
["an_additional_special_token"],
tokenizer_without_change_in_init.convert_ids_to_tokens(
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
),
)
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)]
tokenizer = tokenizer_class.from_pretrained(
tmp_dir,
additional_special_tokens=new_added_tokens,
)
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
self.assertEqual(
["a_new_additional_special_token"],
tokenizer.convert_ids_to_tokens(
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
),
)
def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
......
......@@ -12,7 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import tempfile
import unittest
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
......@@ -294,6 +296,71 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
self.assertTrue(special_token_id in cr_output)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
if self.test_rust_tokenizer:
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
special_tokens_map = json.load(json_file)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(100)]
special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
"an_additional_special_token"
]
tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
"an_additional_special_token"
]
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
json.dump(special_tokens_map, outfile)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
tmp_dir,
)
self.assertIn(
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
)
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
["an_additional_special_token"],
tokenizer_without_change_in_init.convert_ids_to_tokens(
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
),
)
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
tokenizer = tokenizer_class.from_pretrained(
tmp_dir,
additional_special_tokens=new_added_tokens,
)
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
self.assertEqual(
["a_new_additional_special_token"],
tokenizer.convert_ids_to_tokens(
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
),
)
@slow
def test_tokenizer_integration(self):
# fmt: off
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment