Unverified Commit 7223844d authored by SaulLu, committed by GitHub

Change how "additional_special_tokens" argument in the ".from_pretrained"...

Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056)

* add test

* add change in PretrainedTokenizerBase

* change Luke

* deactivate

* add the possibility to add additional special tokens for M2M100

* format

* add special test for canine

* proposed changes for mbart

* proposed changes for mbart50

* proposed changes for byt5

* proposed changes for canine

* proposed changes for t5

* test fast and slow

* remove comment

* remove comment

* add fast version for all tests

* replace break by continue

* add more comments

* add check to avoid duplicates

* remove comment

* format

* proposed change for wav2vec2

* reverse changes mbart

* uncomment

* format
parent b13c6c18
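
In user-facing terms, this commit makes an `additional_special_tokens` value passed to `.from_pretrained` take precedence over the one stored in the checkpoint's `special_tokens_map.json` and `tokenizer_config.json`, while tokenizers that need model-specific special tokens (language codes, entity tokens, extra ids) re-append those in `__init__` rather than overwriting the caller's list. A minimal sketch of the intended behavior, with an illustrative checkpoint name:

```python
from transformers import AddedToken, MBart50Tokenizer

# The caller's list wins over the one saved with the checkpoint...
tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50",  # illustrative checkpoint name
    additional_special_tokens=[AddedToken("<my_token>", lstrip=True)],
)
assert "<my_token>" in tokenizer.additional_special_tokens
# ...and the language codes the model relies on are still re-appended in __init__.
assert "en_XX" in tokenizer.additional_special_tokens
```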
@@ -210,8 +210,8 @@ class LukeTokenizer(RobertaTokenizer):
             if isinstance(entity_token_2, str)
             else entity_token_2
         )
-        kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2]
-        kwargs["additional_special_tokens"] += kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
 
         super().__init__(
             vocab_file=vocab_file,
...
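The ordering in `LukeTokenizer.__init__` was a real bug: the entity tokens were assigned first, so the subsequent `kwargs.get(...)` read back the list that had just been written; anything the caller passed via `additional_special_tokens` was silently dropped and the entity tokens duplicated. A standalone sketch of the two orderings, using plain lists:

```python
entity_tokens = ["<ent>", "<ent2>"]

# Old order: the caller's list is overwritten before it is read back,
# so kwargs.get() returns the entity tokens that were just assigned.
kwargs = {"additional_special_tokens": ["<user_token>"]}
kwargs["additional_special_tokens"] = entity_tokens.copy()
kwargs["additional_special_tokens"] += kwargs.get("additional_special_tokens", [])
assert kwargs["additional_special_tokens"] == ["<ent>", "<ent2>", "<ent>", "<ent2>"]

# New order: start from the caller's list, then append the entity tokens.
kwargs = {"additional_special_tokens": ["<user_token>"]}
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] += entity_tokens
assert kwargs["additional_special_tokens"] == ["<user_token>", "<ent>", "<ent2>"]
```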
@@ -137,6 +137,15 @@ class M2M100Tokenizer(PreTrainedTokenizer):
     ) -> None:
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
 
+        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
+
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [
+            self.get_lang_token(lang_code)
+            for lang_code in FAIRSEQ_LANGUAGE_CODES
+            if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
+        ]
+
         super().__init__(
             src_lang=src_lang,
             tgt_lang=tgt_lang,
@@ -157,14 +166,11 @@ class M2M100Tokenizer(PreTrainedTokenizer):
         self.encoder_size = len(self.encoder)
 
-        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
-
         self.lang_token_to_id = {
             self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)
         }
         self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)}
         self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
-        self._additional_special_tokens = list(self.lang_token_to_id.keys())
 
         self._src_lang = src_lang if src_lang is not None else "en"
         self.tgt_lang = tgt_lang
...
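The `not in` guard matters because `save_pretrained` writes the merged list into the saved config, and `from_pretrained` feeds it back to `__init__` as a kwarg; without the guard, every save/load cycle would append the language tokens again. A sketch of the merge with toy values (the names shadow the real module-level constants):

```python
FAIRSEQ_LANGUAGE_CODES = ["en", "fr", "de"]  # toy subset for illustration

def get_lang_token(lang_code: str) -> str:
    # mirrors M2M100's "__{lang_code}__" convention
    return f"__{lang_code}__"

kwargs = {"additional_special_tokens": ["<user_token>", "__fr__"]}
kwargs["additional_special_tokens"] += [
    get_lang_token(code)
    for code in FAIRSEQ_LANGUAGE_CODES
    if get_lang_token(code) not in kwargs["additional_special_tokens"]
]
# "__fr__" was already present, so it is not appended a second time.
assert kwargs["additional_special_tokens"] == ["<user_token>", "__fr__", "__en__", "__de__"]
```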
@@ -130,6 +130,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
 
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [
+            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
+        ]
+
         super().__init__(
             src_lang=src_lang,
             tgt_lang=tgt_lang,
@@ -168,7 +173,6 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-        self._additional_special_tokens = list(self.lang_code_to_id.keys())
 
         self._src_lang = src_lang if src_lang is not None else "en_XX"
         self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
...
@@ -125,6 +125,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [
+            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
+        ]
+
         super().__init__(
             vocab_file,
             src_lang=src_lang,
@@ -141,7 +146,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         self.vocab_file = vocab_file
 
-        self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES})
         self.lang_code_to_id = {
             lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
         }
...
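For the fast mBART-50 tokenizer the change also moves the language codes out of a post-init `add_special_tokens` call, which replaced whatever the caller had passed, and into the constructor kwargs. A round-trip sketch of the behavior this buys, assuming hub access for the initial load and an illustrative checkpoint name:

```python
import tempfile

from transformers import MBart50TokenizerFast

tok = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")  # illustrative
n_before = len(tok.additional_special_tokens)

# save_pretrained serializes the merged list; on reload it comes back as a
# kwarg, and the "code not in ..." guard keeps the list from growing.
with tempfile.TemporaryDirectory() as tmp_dir:
    tok.save_pretrained(tmp_dir)
    reloaded = MBart50TokenizerFast.from_pretrained(tmp_dir)

assert len(reloaded.additional_special_tokens) == n_before
```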
@@ -1862,6 +1862,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                 special_tokens_map = json.load(special_tokens_map_handle)
             for key, value in special_tokens_map.items():
+                if key in kwargs and kwargs[key]:
+                    # This value has already been redefined by the kwargs
+                    # We keep this new value and ignore the one stored in the special_tokens_map_file
+                    continue
+
                 if isinstance(value, dict):
                     value = AddedToken(**value)
                 elif isinstance(value, list):
...
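This is the core of the fix: when iterating over `special_tokens_map.json`, any key the caller already set to a truthy value in kwargs is skipped, so the file no longer clobbers explicit arguments. The rule in isolation, on toy dicts (not the library code):

```python
# Toy re-implementation of the precedence rule added above.
special_tokens_map = {"additional_special_tokens": ["<from_file>"], "pad_token": "<pad>"}
kwargs = {"additional_special_tokens": ["<from_kwargs>"]}

for key, value in special_tokens_map.items():
    if key in kwargs and kwargs[key]:
        # already redefined by the caller: keep the kwargs value
        continue
    kwargs[key] = value

assert kwargs == {"additional_special_tokens": ["<from_kwargs>"], "pad_token": "<pad>"}
```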
@@ -53,8 +53,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
         with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(feature_extractor_map) + "\n")
 
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.add_kwargs_tokens_map)
+    def get_tokenizer(self, **kwargs_init):
+        kwargs = self.add_kwargs_tokens_map.copy()
+        kwargs.update(kwargs_init)
         return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_feature_extractor(self, **kwargs):
...
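The test helper had the merge order backwards: `kwargs.update(self.add_kwargs_tokens_map)` let the fixture defaults overwrite what an individual test passed in. Copying the defaults first and updating with the caller's kwargs reverses the precedence. The rule in two lines:

```python
# The dict passed to .update() takes precedence over the starting dict.
defaults = {"bos_token": "<s>", "eos_token": "</s>"}  # fixture-style defaults
caller = {"eos_token": "<end>"}                       # per-test override

merged = defaults.copy()
merged.update(caller)
assert merged["eos_token"] == "<end>"
```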
@@ -13,11 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import os
 import shutil
 import tempfile
 import unittest
 
-from transformers import BatchEncoding, ByT5Tokenizer
+from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
 from transformers.file_utils import cached_property, is_tf_available, is_torch_available
 
 from .test_tokenization_common import TokenizerTesterMixin
@@ -161,6 +163,72 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         shutil.rmtree(tmpdirname)
 
+    # There is a conflict between the default value of extra_ids and adding a new special token through
+    # additional_special_tokens. We need to add the extra_ids in the list of the arg additional_special_tokens.
+    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+        tokenizer_list = []
+        if self.test_slow_tokenizer:
+            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
+
+        if self.test_rust_tokenizer:
+            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
+
+        for tokenizer_class, tokenizer_utils in tokenizer_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tokenizer_utils.save_pretrained(tmp_dir)
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
+                    special_tokens_map = json.load(json_file)
+
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+                    tokenizer_config = json.load(json_file)
+
+                added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)]
+
+                special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
+                    "an_additional_special_token"
+                ]
+                tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
+                    "an_additional_special_token"
+                ]
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(special_tokens_map, outfile)
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(tokenizer_config, outfile)
+
+                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer
+                # takes into account the new value of additional_special_tokens given in the "tokenizer_config.json"
+                # and "special_tokens_map.json" files
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                )
+                self.assertIn(
+                    "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
+                )
+                # self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())  # ByT5Tokenization no vocab
+                self.assertEqual(
+                    ["an_additional_special_token"],
+                    tokenizer_without_change_in_init.convert_ids_to_tokens(
+                        tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
+                    ),
+                )
+
+                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
+                new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
+                tokenizer = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                    additional_special_tokens=new_added_tokens,
+                )
+
+                self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
+                self.assertEqual(
+                    ["a_new_additional_special_token"],
+                    tokenizer.convert_ids_to_tokens(
+                        tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
+                    ),
+                )
+
     # tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
     def test_pretrained_model_lists(self):
         pass
...
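As the comment at the top of the test notes, T5-style tokenizers build `extra_ids` sentinel tokens (125 for ByT5) into `additional_special_tokens` by default, so an override passed to `from_pretrained` has to carry them along or the constructor's consistency check on the extra-id count fails. A usage sketch, with an illustrative checkpoint name:

```python
from transformers import ByT5Tokenizer

extra_ids = [f"<extra_id_{i}>" for i in range(125)]  # ByT5's default count
tok = ByT5Tokenizer.from_pretrained(
    "google/byt5-small",  # illustrative checkpoint
    additional_special_tokens=extra_ids + ["my_token"],
)
assert "my_token" in tok.additional_special_tokens
```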
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import os
 import shutil
 import tempfile
 import unittest
@@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 tokenizer.save_pretrained(tmp_dir_name)
                 tokenizer.from_pretrained(tmp_dir_name)
 
+    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+        tokenizer_list = []
+        if self.test_slow_tokenizer:
+            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
+
+        if self.test_rust_tokenizer:
+            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
+
+        for tokenizer_class, tokenizer_utils in tokenizer_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tokenizer_utils.save_pretrained(tmp_dir)
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
+                    special_tokens_map = json.load(json_file)
+
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+                    tokenizer_config = json.load(json_file)
+
+                # a special token for Canine can be defined as follows:
+                NEW_TOKEN = 0xE006
+                new_token_1 = chr(NEW_TOKEN)
+
+                special_tokens_map["additional_special_tokens"] = [new_token_1]
+                tokenizer_config["additional_special_tokens"] = [new_token_1]
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(special_tokens_map, outfile)
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(tokenizer_config, outfile)
+
+                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer
+                # takes into account the new value of additional_special_tokens given in the "tokenizer_config.json"
+                # and "special_tokens_map.json" files
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
+                self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
+                # self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())  # ByT5Tokenization no vocab
+                self.assertEqual(
+                    [new_token_1],
+                    tokenizer_without_change_in_init.convert_ids_to_tokens(
+                        tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
+                    ),
+                )
+
+                NEW_TOKEN = 0xE007
+                new_token_2 = chr(NEW_TOKEN)
+                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
+                new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
+                tokenizer = tokenizer_class.from_pretrained(
+                    tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
+                )
+
+                self.assertIn(new_token_2, tokenizer.additional_special_tokens)
+                # self.assertIn(new_token_2, tokenizer.get_vocab())  # ByT5Tokenization no vocab
+                self.assertEqual(
+                    [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
+                )
+
     @require_tokenizers
     def test_encode_decode_with_spaces(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
...
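The Canine variant picks its sample tokens from the Unicode private use area because Canine tokenizes text as raw code points: a special token must be a single character, and PUA code points are guaranteed not to collide with natural text (Canine's own special tokens such as CLS live there as well). For instance:

```python
# A Canine-style special token is one character from the private use area.
NEW_TOKEN = 0xE006          # U+E000..U+F8FF is the Basic Multilingual
new_token = chr(NEW_TOKEN)  # Plane's private use area
assert len(new_token) == 1
assert 0xE000 <= ord(new_token) <= 0xF8FF
```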
@@ -16,6 +16,7 @@
 import inspect
 import itertools
+import json
 import os
 import pickle
 import re
@@ -3161,6 +3162,64 @@ class TokenizerTesterMixin:
                     self.assertTrue(special_token_id in p_output)
                     self.assertTrue(special_token_id in cr_output)
 
+    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+        tokenizer_list = []
+        if self.test_slow_tokenizer:
+            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
+
+        if self.test_rust_tokenizer:
+            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
+
+        for tokenizer_class, tokenizer_utils in tokenizer_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tokenizer_utils.save_pretrained(tmp_dir)
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
+                    special_tokens_map = json.load(json_file)
+
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+                    tokenizer_config = json.load(json_file)
+
+                special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
+                tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(special_tokens_map, outfile)
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(tokenizer_config, outfile)
+
+                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer
+                # takes into account the new value of additional_special_tokens given in the "tokenizer_config.json"
+                # and "special_tokens_map.json" files
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                )
+                self.assertIn(
+                    "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
+                )
+                self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())
+                self.assertEqual(
+                    ["an_additional_special_token"],
+                    tokenizer_without_change_in_init.convert_ids_to_tokens(
+                        tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
+                    ),
+                )
+
+                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
+                new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)]
+                tokenizer = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                    additional_special_tokens=new_added_tokens,
+                )
+
+                self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
+                self.assertEqual(
+                    ["a_new_additional_special_token"],
+                    tokenizer.convert_ids_to_tokens(
+                        tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
+                    ),
+                )
+
     def test_training_new_tokenizer(self):
         # This feature only exists for fast tokenizers
         if not self.test_rust_tokenizer:
...
@@ -12,7 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import os
+import tempfile
 import unittest
 
 from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
@@ -294,6 +296,71 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertTrue(special_token_id in r_output)
         self.assertTrue(special_token_id in cr_output)
 
+    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+        tokenizer_list = []
+        if self.test_slow_tokenizer:
+            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
+
+        if self.test_rust_tokenizer:
+            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
+
+        for tokenizer_class, tokenizer_utils in tokenizer_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tokenizer_utils.save_pretrained(tmp_dir)
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
+                    special_tokens_map = json.load(json_file)
+
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+                    tokenizer_config = json.load(json_file)
+
+                added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(100)]
+
+                special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
+                    "an_additional_special_token"
+                ]
+                tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
+                    "an_additional_special_token"
+                ]
+
+                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(special_tokens_map, outfile)
+                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+                    json.dump(tokenizer_config, outfile)
+
+                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer
+                # takes into account the new value of additional_special_tokens given in the "tokenizer_config.json"
+                # and "special_tokens_map.json" files
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                )
+                self.assertIn(
+                    "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
+                )
+                # self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())  # ByT5Tokenization no vocab
+                self.assertEqual(
+                    ["an_additional_special_token"],
+                    tokenizer_without_change_in_init.convert_ids_to_tokens(
+                        tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
+                    ),
+                )
+
+                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
+                new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
+                tokenizer = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                    additional_special_tokens=new_added_tokens,
+                )
+
+                self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
+                self.assertEqual(
+                    ["a_new_additional_special_token"],
+                    tokenizer.convert_ids_to_tokens(
+                        tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
+                    ),
+                )
+
     @slow
     def test_tokenizer_integration(self):
         # fmt: off
...