Unverified Commit 8d9c3836 authored by Arthur, committed by GitHub

Add clean_up_tokenization_spaces to config (#22341)



* add draft changes

* fix failing wav2vec

* style

* make sure that the argument is saved + add tests

* style

* fixup

* update test

* default clean_up_tokenization_spaces to False for Bloom and Llama

* Update code based on review
Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>

* style

* quality

---------
Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>
parent b29fd697
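
In short: `clean_up_tokenization_spaces` becomes a regular tokenizer attribute that is read from and written to `tokenizer_config.json`, and the corresponding `decode`/`batch_decode` argument now defaults to `None`, falling back to that attribute when not passed explicitly. A minimal usage sketch (the checkpoint name is only an example; any tokenizer behaves the same way):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint

ids = tokenizer.encode("This shouldn't be! He'll go.")

# No argument passed -> decode() falls back to tokenizer.clean_up_tokenization_spaces,
# which defaults to True (spaces before punctuation and inside contractions are removed).
print(tokenizer.decode(ids))

# An explicit argument still overrides the configured value for this single call.
print(tokenizer.decode(ids, clean_up_tokenization_spaces=False))

# The attribute can also be changed on the instance and is persisted on save.
tokenizer.clean_up_tokenization_spaces = False
tokenizer.save_pretrained("./my-tokenizer")  # value ends up in tokenizer_config.json
```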
```diff
@@ -204,7 +204,7 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         spaces_between_special_tokens: bool = True,
         **kwargs,
     ) -> str:
@@ -237,6 +237,11 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         else:
             text = "".join(sub_texts)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -115,6 +115,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
         eos_token="</s>",
         pad_token="<pad>",
         add_prefix_space=False,
+        clean_up_tokenization_spaces=False,
         **kwargs,
     ):
         super().__init__(
@@ -126,6 +127,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
             eos_token=eos_token,
             pad_token=pad_token,
             add_prefix_space=add_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
         pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
```
......
```diff
@@ -320,7 +320,7 @@ class CodeGenTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         truncate_before_pattern: Optional[List[str]] = None,
         **kwargs,
     ) -> str:
@@ -335,8 +335,9 @@ class CodeGenTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
                 A list of regular expression strings that will be used to truncate the returned string. This can be
                 used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
```
......
```diff
@@ -187,7 +187,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
         self,
         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         truncate_before_pattern: Optional[List[str]] = None,
         **kwargs,
     ) -> str:
@@ -202,8 +202,9 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
                 A list of regular expression strings that will be used to truncate the returned string. This can be
                 used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
```
......
```diff
@@ -236,7 +236,7 @@ class FNetTokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         spaces_between_special_tokens: bool = True,
         **kwargs,
     ) -> str:
@@ -269,6 +269,11 @@ class FNetTokenizer(PreTrainedTokenizer):
         else:
             text = "".join(sub_texts)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -59,10 +59,17 @@ class LlamaTokenizer(PreTrainedTokenizer):
         add_bos_token=True,
         add_eos_token=False,
         decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
         **kwargs,
     ):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
         self.vocab_file = vocab_file
         self.add_bos_token = add_bos_token
         self.add_eos_token = add_eos_token
```
......
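
Note from the diff above and the commit message: `BloomTokenizerFast` and `LlamaTokenizer` now default `clean_up_tokenization_spaces` to `False` in their constructors, while the base-class default stays `True`. A hedged sketch of what that means in practice ("bigscience/bloom-560m" is only an illustrative checkpoint name):

```python
from transformers import BloomTokenizerFast

# Unless the saved tokenizer_config.json already stores a value, the new class
# default applies, so cleanup is off for Bloom (and likewise for LlamaTokenizer).
tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
print(tok.clean_up_tokenization_spaces)  # expected: False

# The default can still be overridden per instance at load time.
tok = BloomTokenizerFast.from_pretrained(
    "bigscience/bloom-560m", clean_up_tokenization_spaces=True
)
print(tok.clean_up_tokenization_spaces)  # expected: True
```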
```diff
@@ -225,8 +225,9 @@ class MarianTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                 Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                 problems).
@@ -250,8 +251,9 @@ class MarianTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                 Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                 problems).
```
......
```diff
@@ -373,7 +373,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         group_tokens: bool = True,
         spaces_between_special_tokens: bool = False,
         output_word_offsets: Optional[bool] = False,
@@ -402,6 +402,11 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         text = string_output["text"]
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             text = self.clean_up_tokenization(text)
@@ -421,7 +426,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         self,
         sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_char_offsets: bool = False,
         output_word_offsets: bool = False,
         **kwargs,
@@ -434,7 +439,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            clean_up_tokenization_spaces (`bool`, *optional*):
                 Whether or not to clean up the tokenization spaces.
             output_char_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output character offsets. Character offsets can be used in combination with the
@@ -491,7 +496,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_char_offsets: bool = False,
         output_word_offsets: bool = False,
         **kwargs,
@@ -507,7 +512,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            clean_up_tokenization_spaces (`bool`, *optional*):
                 Whether or not to clean up the tokenization spaces.
             output_char_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output character offsets. Character offsets can be used in combination with the
@@ -887,7 +892,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         """
@@ -905,6 +910,11 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
         text = self.convert_tokens_to_string(result)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -409,7 +409,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         group_tokens: bool = True,
         filter_word_delimiter_token: bool = True,
         spaces_between_special_tokens: bool = False,
@@ -438,6 +438,11 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         text = string_output["text"]
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             text = self.clean_up_tokenization(text)
@@ -451,7 +456,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_char_offsets: bool = False,
         **kwargs,
     ) -> str:
@@ -466,7 +471,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            clean_up_tokenization_spaces (`bool`, *optional*):
                 Whether or not to clean up the tokenization spaces.
             output_char_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output character offsets. Character offsets can be used in combination with the
@@ -507,7 +512,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         self,
         sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_char_offsets: bool = False,
         **kwargs,
     ) -> List[str]:
@@ -519,7 +524,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            clean_up_tokenization_spaces (`bool`, *optional*):
                 Whether or not to clean up the tokenization spaces.
             output_char_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output character offsets. Character offsets can be used in combination with the
```
......
```diff
@@ -556,7 +556,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
         self,
         token_ids,
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_offsets: bool = False,
         time_precision=0.02,
         decode_with_timestamps: bool = False,
@@ -573,8 +573,9 @@ class WhisperTokenizer(PreTrainedTokenizer):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
             output_offsets (`bool`, *optional*, defaults to `False`):
```
......
```diff
@@ -266,7 +266,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
         self,
         token_ids,
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         output_offsets: bool = False,
         time_precision=0.02,
         decode_with_timestamps: bool = False,
@@ -283,8 +283,9 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
             output_offsets (`bool`, *optional*, defaults to `False`):
```
......
```diff
@@ -254,7 +254,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         spaces_between_special_tokens: bool = True,
         **kwargs,
     ) -> str:
@@ -284,6 +284,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
             # By default, there are no spaces between special tokens
             text = "".join(sub_texts)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -922,7 +922,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         self,
         token_ids: List[int],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         spaces_between_special_tokens: bool = True,
         **kwargs,
     ) -> str:
@@ -953,6 +953,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         else:
             text = "".join(sub_texts)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -1470,6 +1470,9 @@ INIT_TOKENIZER_DOCSTRING = r"""
             A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
             tokenization process. Will be associated to `self.additional_special_tokens` and
             `self.additional_special_tokens_ids`.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+            tokenization process.
 """
```
```diff
@@ -1521,6 +1524,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
+        # By default, cleaning tokenization spaces for both fast and slow tokenizers
+        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
+
         self.deprecation_warnings = (
             {}
         )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
@@ -1576,7 +1582,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
             f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
             f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
-            f" special_tokens={self.special_tokens_map_extended})"
+            f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})"
         )
 
     def __len__(self) -> int:
@@ -2112,7 +2118,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
         # target_keys = self.init_kwargs.keys()
-        target_keys = ["model_max_length"]
+        target_keys = ["model_max_length", "clean_up_tokenization_spaces"]
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)
```
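
Because `clean_up_tokenization_spaces` is now in `target_keys`, its current value is serialized into `tokenizer_config.json` by `save_pretrained` and restored on reload. A round-trip sketch (directory and checkpoint names are arbitrary examples):

```python
import json
import tempfile

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
tokenizer.clean_up_tokenization_spaces = False

with tempfile.TemporaryDirectory() as tmp_dir:
    tokenizer.save_pretrained(tmp_dir)

    # The attribute is written next to model_max_length in the tokenizer config.
    with open(f"{tmp_dir}/tokenizer_config.json") as f:
        config = json.load(f)
    print(config["clean_up_tokenization_spaces"])  # expected: False

    # ... and picked up again when the tokenizer is reloaded from that directory.
    reloaded = AutoTokenizer.from_pretrained(tmp_dir)
    print(reloaded.clean_up_tokenization_spaces)  # expected: False
```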
```diff
@@ -3416,7 +3422,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self,
         sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> List[str]:
         """
@@ -3427,8 +3433,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces`.
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
@@ -3449,7 +3456,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self,
         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         """
@@ -3463,8 +3470,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-                Whether or not to clean up the tokenization spaces.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces`.
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
@@ -3485,7 +3493,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         raise NotImplementedError
```
......
```diff
@@ -539,7 +539,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
+        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
@@ -548,6 +548,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             token_ids = [token_ids]
         text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
 
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
             return clean_text
```
......
```diff
@@ -3895,6 +3895,51 @@ class TokenizerTesterMixin:
             # Should not raise an error
             self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
 
+    def test_clean_up_tokenization_spaces(self):
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        assert tokenizer.clean_up_tokenization_spaces is True
+
+        tokens = tokenizer.encode("This shouldn't be! He'll go.")
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
+
+        tokenizer.clean_up_tokenization_spaces = False
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
+        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
+
+        # Fast from slow
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer.save_pretrained(tmp_dir_2)
+            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
+            del tokenizer
+            assert tokenizer_fast.clean_up_tokenization_spaces is False
+
+            decoded = tokenizer_fast.decode(tokens)
+            # fast and slow don't have the same output when we don't cleanup
+            # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
+            assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
+
+            tokenizer_fast.clean_up_tokenization_spaces = True
+            assert tokenizer_fast.clean_up_tokenization_spaces is True
+
+            decoded = tokenizer_fast.decode(tokens)
+            assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
+
+        # Slow from fast
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer_fast.clean_up_tokenization_spaces = False
+            tokenizer_fast.save_pretrained(tmp_dir_2)
+            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
+            assert tokenizer_fast.clean_up_tokenization_spaces is False
+
+            decoded = tokenizer.decode(tokens)
+            assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
+
+            tokenizer.clean_up_tokenization_spaces = True
+            decoded = tokenizer.decode(tokens)
+            assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
+
 
 class TokenizerUtilTester(unittest.TestCase):
     def test_cached_files_are_used_when_internet_is_down(self):
```
......