Unverified Commit ef7e9369 authored by Arthur, committed by GitHub

[`Tokenizer`] Fix slow and fast serialization (#26570)

* fix

* last attempt

* current work

* fix forward compatibility

* save all special tokens

* current state

* revert additional changes

* updates

* remove tokenizer.model

* add a test and the fix

* nit

* revert one more break

* fix typefield issue

* quality

* more tests

* fix fields for FC

* more nits?

* new additional changes

* how

* some updates

* simplify all

* more nits

* revert some things to original

* nice

* nits

* a small hack

* more nits

* ahhaha

* fixup

* update

* make test run on ci

* use subtesting

* update

* Update .circleci/create_circleci_config.py

* updates

* fixup

* nits

* replace typo

* fix the test

* nits

* update

* None max dif pls

* a partial fix

* had to revert one thing

* test the fast

* updates

* fixup

* and more nits

* more fixes

* update

* Oupsy 👁



* nits

* fix marian

* on our way to heaven

* Update src/transformers/models/t5/tokenization_t5.py
Co-authored-by: Lysandre Debut <hi@lysand.re>

* fixup

* Update src/transformers/tokenization_utils_fast.py
Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>

* fix phobert

* skip some things, test more

* nits

* fixup

* fix deberta

* update

* update

* more updates

* skip one test

* more updates

* fix camembert

* can't test this one

* more good fixes

* kind of a major update

- separate what is only done in fast into the fast init and refactor
- add_token(AddedToken(..., special=True)) ignores it in fast
- better loading (see the sketch below)
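
A minimal sketch of the save/reload round trip this overhaul targets (not part of the PR itself). It assumes transformers >= 4.34 with tokenizers >= 0.14, and uses `bert-base-uncased` purely as an illustrative checkpoint; the point is that an added token's flags should survive serialization for both slow and fast tokenizers.

```python
# Minimal sketch, not from the PR: the save/reload round trip the serialization fix targets.
# Assumes transformers >= 4.34 (tokenizers >= 0.14); "bert-base-uncased" is purely illustrative.
import tempfile

from transformers import AddedToken, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# The flags on an added token (special, normalized, lstrip, ...) are what must survive saving.
tok.add_tokens([AddedToken("<new_special>", special=True, normalized=False)])

with tempfile.TemporaryDirectory() as tmp:
    tok.save_pretrained(tmp)
    reloaded = AutoTokenizer.from_pretrained(tmp)

# After the fix, both mappings should agree on the added tokens' content and flags.
print(tok.added_tokens_decoder)
print(reloaded.added_tokens_decoder)
```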

* fixup

* more fixups

* fix pegasus and mpnet

* remove skipped tests

* fix phoneme tokenizer if self.verbose

* fix individual models

* update common tests

* update testing files

* all over again

* nits

* skip test for markup lm

* fixups

* fix order of addition in fast by sorting the added tokens decoder

* proper defaults for deberta

* correct default for fnet

* nits on add tokens, string initialized to special if special

* skip irrelevant herbert tests

* main fixes

* update test added_tokens_serialization

* the fix for bart-like models and class instantiating

* update bart

* nit!

* update idefix test

* fix whisper!

* some fixup

* fixups

* revert some of the wrong changes

* fixup

* fixup

* skip marian

* skip the correct tests

* skip for tf and flax as well

---------
Co-authored-by: Lysandre Debut <hi@lysand.re>
Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>
parent 34678db4
@@ -127,6 +127,7 @@ class CircleCIJob:
             },
         ]
         steps.extend([{"run": l} for l in self.install_steps])
+        steps.extend([{"run": "pip install pytest-subtests"}])
         steps.append(
             {
                 "save_cache": {
......
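The extra `pip install pytest-subtests` is what makes the "use subtesting" commit above work on CI: the plugin lets pytest report each `subTest` block as its own pass/fail instead of aborting a test method at the first failing checkpoint. A hedged sketch of the pattern (the test class and checkpoint names below are made up for illustration):

```python
# Sketch, not from the PR: what `pytest-subtests` enables when running unittest-style tests
# under pytest. Each `subTest` block is collected and reported individually.
import unittest


class AddedTokensSerializationTest(unittest.TestCase):  # hypothetical test, for illustration only
    def test_many_checkpoints(self):
        for name in ["checkpoint-a", "checkpoint-b"]:  # made-up checkpoint names
            with self.subTest(checkpoint=name):
                self.assertGreater(len(name), 0)


if __name__ == "__main__":
    unittest.main()
```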
@@ -1168,9 +1168,9 @@ class LlamaConverter(SpmConverter):
             )
             tokenizer.add_special_tokens(
                 [
-                    AddedToken("<unk>"),
-                    AddedToken("<s>"),
-                    AddedToken("</s>"),
+                    AddedToken("<unk>", normalized=False, special=True),
+                    AddedToken("<s>", normalized=False, special=True),
+                    AddedToken("</s>", normalized=False, special=True),
                 ]
             )
         else:
......
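For context, the two flags the converter now sets explicitly live on `tokenizers.AddedToken`: `normalized=False` means the token is matched against the raw, un-normalized input, and `special=True` marks it as a special token (e.g. skipped when decoding with `skip_special_tokens=True`). A small standalone sketch, assuming tokenizers >= 0.14:

```python
# Standalone sketch (assumes tokenizers >= 0.14), not taken from the converter itself.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())  # empty model, enough to show the flags
tokenizer.add_special_tokens(
    [
        AddedToken("<unk>", normalized=False, special=True),
        AddedToken("<s>", normalized=False, special=True),
        AddedToken("</s>", normalized=False, special=True),
    ]
)
print(tokenizer.get_vocab())  # the three special tokens are now part of the vocabulary
```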
@@ -204,8 +204,6 @@ class BartTokenizer(PreTrainedTokenizer):
         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
         # Mask token behave like a normal word, i.e. include the space before it
-        # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
-        # Also this not only will strip the spaces but any punctuation
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
......
@@ -170,7 +170,12 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
         trim_offsets=True,
         **kwargs,
     ):
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # we have to specify that this token is special, otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, normalized=True, special=True)
+            if isinstance(mask_token, str)
+            else mask_token
+        )
         super().__init__(
             vocab_file,
             merges_file,
......
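The comment in the hunk above is the crux of several fast-tokenizer fixes: for Bart/LED the `<mask>` token wants `lstrip=True` and `normalized=True`, and marking it `special=True` up front keeps `add_special_tokens` from re-registering it with different defaults. One hedged way to check the resulting flags, assuming transformers >= 4.34 and that the checkpoint's tokenizer lists `<mask>` among its added tokens:

```python
# Inspection sketch, not part of the PR; "facebook/bart-base" is only an example checkpoint.
from transformers import BartTokenizerFast

tok = BartTokenizerFast.from_pretrained("facebook/bart-base")
mask = tok.added_tokens_decoder[tok.mask_token_id]
# Expected after the fix: lstrip=True (the space before <mask> is absorbed) and special=True.
print(mask.content, mask.lstrip, mask.rstrip, mask.normalized, mask.special)
```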
@@ -136,8 +136,8 @@ class BarthezTokenizer(PreTrainedTokenizer):
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
......
@@ -149,10 +149,10 @@ class BertweetTokenizer(PreTrainedTokenizer):
         self.merges_file = merges_file
 
         self.encoder = {}
-        self.encoder[bos_token] = 0
-        self.encoder[pad_token] = 1
-        self.encoder[eos_token] = 2
-        self.encoder[unk_token] = 3
+        self.encoder[str(bos_token)] = 0
+        self.encoder[str(pad_token)] = 1
+        self.encoder[str(eos_token)] = 2
+        self.encoder[str(unk_token)] = 3
 
         self.add_from_file(vocab_file)
......
@@ -89,7 +89,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (`List[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED']`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED', '<unk>NOTUSED']`):
             Additional special tokens used by the tokenizer.
         sp_model_kwargs (`dict`, *optional*):
             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
@@ -127,12 +127,16 @@ class CamembertTokenizer(PreTrainedTokenizer):
         unk_token="<unk>",
         pad_token="<pad>",
         mask_token="<mask>",
-        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
+        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
+            if isinstance(mask_token, str)
+            else mask_token
+        )
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
@@ -144,11 +148,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
         # sentencepiece vocabulary (this is the case for <s> and </s> and <unk>).
         # In this case it is recommended to properly set the tokens by hand.
         self._added_tokens_decoder = {
-            0: AddedToken("<s>NOTUSED"),
-            1: AddedToken(pad_token),
-            2: AddedToken("</s>NOTUSED"),
-            3: AddedToken(unk_token),
-            4: AddedToken("<unk>NOTUSED"),
+            0: AddedToken("<s>NOTUSED", special=True),
+            1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
+            2: AddedToken("</s>NOTUSED", special=True),
+            3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
+            4: AddedToken("<unk>NOTUSED", special=True),
         }
         self.fairseq_offset = 4  # 3 tokens are newly added, but the offset starts from 4
......
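This hand-set `_added_tokens_decoder` is also a good place to see what the PR's serialization change does: the index-to-`AddedToken` mapping is what gets written out and read back, so the `NOTUSED` placeholders keep their original fairseq indices. A hedged sketch of how to inspect it, assuming transformers >= 4.34, a sentencepiece install, and using `camembert-base` purely as an example checkpoint:

```python
# Inspection sketch, not part of the PR. Assumes transformers >= 4.34 and sentencepiece;
# "camembert-base" is only an example checkpoint.
import json
import tempfile

from transformers import CamembertTokenizer

tok = CamembertTokenizer.from_pretrained("camembert-base")
# The lowest ids are the hand-set placeholders from the hunk above.
print({i: str(t) for i, t in sorted(tok.added_tokens_decoder.items())[:5]})

with tempfile.TemporaryDirectory() as tmp:
    tok.save_pretrained(tmp)
    with open(f"{tmp}/tokenizer_config.json") as f:
        saved = json.load(f)
    # After this PR the mapping (content and flags) should round-trip through the saved config.
    print(list(saved.get("added_tokens_decoder", {}).items())[:5])
```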
@@ -119,12 +119,11 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
         unk_token="<unk>",
         pad_token="<pad>",
         mask_token="<mask>",
-        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
+        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
         **kwargs,
     ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
+        # Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
         super().__init__(
             vocab_file,
             tokenizer_file=tokenizer_file,
......
@@ -163,10 +163,10 @@ class CodeGenTokenizer(PreTrainedTokenizer):
         add_bos_token=False,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
         self.add_bos_token = add_bos_token
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
......
@@ -192,12 +192,12 @@ class DebertaTokenizer(PreTrainedTokenizer):
         add_bos_token=False,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
 
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
......
@@ -138,7 +138,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         self._tokenizer = SPMTokenizer(
             vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
         )
-        unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False)
+        unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
         super().__init__(
             do_lower_case=do_lower_case,
             bos_token=bos_token,
......
@@ -116,9 +116,10 @@ class FNetTokenizer(PreTrainedTokenizer):
     ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it and
         # is included in the raw text, there should be a match in a non-normalized sentence.
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
+        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
+        mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         self.do_lower_case = do_lower_case
......
@@ -20,7 +20,7 @@ import sys
 import unicodedata
 from typing import Dict, List, Optional, Tuple, Union
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     BatchEncoding,
     EncodedInput,
@@ -244,6 +244,12 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         additional_special_tokens: Optional[List[str]] = None,
         **kwargs,
     ):
+        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
+        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
+        mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
+
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
......
@@ -248,7 +248,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         **kwargs,
     ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
......
@@ -197,8 +197,6 @@ class LEDTokenizer(PreTrainedTokenizer):
         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
         # Mask token behave like a normal word, i.e. include the space before it
-        # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
-        # Also this not only will strip the spaces but any punctuation
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
......
@@ -152,7 +152,12 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
         trim_offsets=True,
         **kwargs,
     ):
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # we have to specify that this token is special, otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, normalized=True, special=True)
+            if isinstance(mask_token, str)
+            else mask_token
+        )
         super().__init__(
             vocab_file,
             merges_file,
......
@@ -155,10 +155,10 @@ class LlamaTokenizer(PreTrainedTokenizer):
         **kwargs,
     ):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
 
         if legacy is None:
             logger.warning_once(
......
@@ -148,9 +148,9 @@ class MarianTokenizer(PreTrainedTokenizer):
         self.separate_vocabs = separate_vocabs
         self.encoder = load_json(vocab)
-        if unk_token not in self.encoder:
+        if str(unk_token) not in self.encoder:
             raise KeyError("<unk> token must be in the vocab")
-        assert pad_token in self.encoder
+        assert str(pad_token) in self.encoder
 
         if separate_vocabs:
             self.target_encoder = load_json(target_vocab_file)
......
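A short sketch of why the `str(...)` casts are needed: after the refactor, the special tokens may reach `MarianTokenizer.__init__` as `AddedToken` objects rather than plain strings, while the vocab loaded from JSON is keyed by strings, so membership checks have to go through `str()`.

```python
# Sketch, not from the PR: membership checks against a string-keyed vocab work for both
# plain strings and AddedToken objects once cast with str().
from transformers import AddedToken

pad_token = AddedToken("<pad>", special=True)  # what __init__ may now receive
encoder = {"<unk>": 0, "<pad>": 1}             # toy stand-in for load_json(vocab)

print(str(pad_token))             # "<pad>"
print(str(pad_token) in encoder)  # True, whether pad_token is a str or an AddedToken
```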
@@ -97,7 +97,9 @@ class MBartTokenizer(PreTrainedTokenizer):
         **kwargs,
     ):
         # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
+        )
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
......
@@ -132,7 +132,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
 
-        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
        ]
......
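The `or []` guard matters because `dict.get(key, [])` only falls back to the default when the key is missing; a user who explicitly passes `additional_special_tokens=None` would otherwise send `None` into the `+=` below. A minimal sketch of the behavior:

```python
# Sketch, not from the PR: why `kwargs.get(..., []) or []` is used instead of `kwargs.get(..., [])`.
kwargs = {"additional_special_tokens": None}  # the key exists, so the .get() default is ignored

tokens = kwargs.get("additional_special_tokens", []) or []
tokens += ["ar_AR", "cs_CZ"]  # language codes are appended afterwards, as in the tokenizer
print(tokens)  # ['ar_AR', 'cs_CZ']
```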