Unverified Commit 135791e8 authored by Funtowicz Morgan, committed by GitHub

Add pad_to_multiple_of on tokenizers (reimport) (#5054)



* Add new parameter `pad_to_multiple_of` on tokenizers.

* unittest for pad_to_multiple_of

* Add .name when logging enum.

* Fix missing .items() on dict in tests.

* Add special check + warning if the tokenizer doesn't have a proper pad_token.

* Use the correct logger format specifier.

* Ensure tokenizers with no pad_token do not modify the underlying padding strategy.

* Skip test if tokenizer doesn't have pad_token

* Fix RobertaTokenizer on empty input

* Format.
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix and updating to simpler API
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
parent 7cc15bdd
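
Before the diff, a minimal usage sketch of the new argument (the `bert-base-uncased` checkpoint is only an illustration and is not part of this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# Pad the encoding up to the next multiple of 8 so tensor shapes line up with
# Tensor Core friendly dimensions on recent NVIDIA GPUs.
encoding = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
assert len(encoding["input_ids"]) % 8 == 0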
@@ -244,7 +244,7 @@ class RobertaTokenizer(GPT2Tokenizer):
    def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-       if (is_pretokenized or add_prefix_space) and text:
+       if (is_pretokenized or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
            text = " " + text
        return (text, kwargs)
...
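
The guard above only prepends a space when the text is non-empty and does not already start with whitespace, which fixes the empty-input case mentioned in the commit message. A standalone sketch of that condition (illustrative helper, not the actual tokenizer method):

def maybe_add_prefix_space(text: str, add_prefix_space: bool = True) -> str:
    # Same condition as in the diff: skip empty strings and strings that
    # already begin with whitespace.
    if add_prefix_space and len(text) > 0 and not text[0].isspace():
        return " " + text
    return text

assert maybe_add_prefix_space("") == ""               # empty input stays empty
assert maybe_add_prefix_space("hello") == " hello"
assert maybe_add_prefix_space(" hello") == " hello"   # no double prefix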
@@ -409,6 +409,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -461,6 +462,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
@@ -487,6 +489,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -541,6 +544,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
@@ -561,6 +565,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -587,6 +592,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                truncation_strategy=truncation_strategy,
                max_length=max_length,
                stride=stride,
+               pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
@@ -606,6 +612,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
@@ -623,6 +630,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        prepend_batch_axis: bool = False,
        return_token_type_ids: Optional[bool] = None,
@@ -654,8 +662,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        encoded_inputs = {}

-       # Truncation: Handle max sequence length
+       # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

+       # Truncation: Handle max sequence length
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
@@ -700,6 +710,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            encoded_inputs,
            max_length=max_length,
            padding=padding_strategy.value,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
...
@@ -960,6 +960,9 @@ ENCODE_KWARGS_DOCSTRING = r"""
                The value of this argument defines the number of overlapping tokens.
            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Set to True to indicate the input is already tokenized
+           pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+               This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+               >= 7.5 (Volta).
            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
                Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
                PyTorch :obj:`torch.Tensor` or Numpy :oj: `np.ndarray` instead of a list of python integers.
@@ -1427,7 +1430,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        raise NotImplementedError

    def _get_padding_truncation_strategies(
-       self, padding=False, truncation=False, max_length=None, verbose=True, **kwargs
+       self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """ Find the correct padding/truncation strategy with backward compatibility
        for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
@@ -1527,6 +1530,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                    "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
                )

+       # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
+       if (
+           truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
+           and padding_strategy != PaddingStrategy.DO_NOT_PAD
+           and pad_to_multiple_of is not None
+           and max_length is not None
+           and (max_length % pad_to_multiple_of != 0)
+       ):
+           raise ValueError(
+               f"Truncation and padding are both activated but "
+               f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
+           )
+
        return padding_strategy, truncation_strategy, max_length, kwargs

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
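
The new check above rejects configurations where the truncation length can never be aligned with `pad_to_multiple_of`. A minimal sketch of the same arithmetic, outside the tokenizer class:

def check_length_vs_multiple(max_length: int, pad_to_multiple_of: int) -> None:
    # Same condition as the diff: a truncation length that is not a multiple
    # of pad_to_multiple_of cannot produce aligned sequences.
    if max_length % pad_to_multiple_of != 0:
        raise ValueError(
            f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
        )

check_length_vs_multiple(16, 8)    # fine: 16 is a multiple of 8
# check_length_vs_multiple(12, 8)  # raises ValueError, mirroring the unit test below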
@@ -1540,6 +1556,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -1581,6 +1598,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            max_length=max_length,
            stride=stride,
            is_pretokenized=is_pretokenized,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
@@ -1601,6 +1619,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            max_length=max_length,
            stride=stride,
            is_pretokenized=is_pretokenized,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
@@ -1623,6 +1642,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -1650,7 +1670,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-           padding, truncation, max_length, verbose, **kwargs
+           padding=padding,
+           truncation=truncation,
+           max_length=max_length,
+           pad_to_multiple_of=pad_to_multiple_of,
+           verbose=verbose,
+           **kwargs,
        )

        return self._encode_plus(
@@ -1662,6 +1687,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            max_length=max_length,
            stride=stride,
            is_pretokenized=is_pretokenized,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
@@ -1683,6 +1709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -1712,6 +1739,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -1738,7 +1766,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-           padding, truncation, max_length, verbose, **kwargs
+           padding=padding,
+           truncation=truncation,
+           max_length=max_length,
+           pad_to_multiple_of=pad_to_multiple_of,
+           verbose=verbose,
+           **kwargs,
        )

        return self._batch_encode_plus(
@@ -1749,6 +1782,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            max_length=max_length,
            stride=stride,
            is_pretokenized=is_pretokenized,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
@@ -1776,6 +1810,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -1799,6 +1834,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        ],
        padding: Union[bool, str] = True,
        max_length: Optional[int] = None,
+       pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
@@ -1820,6 +1856,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                    - 'do_not_pad' (or `False`): Do not pad
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
+           pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+               This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+               >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
                Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
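
Beyond `__call__`/`encode_plus`, the standalone `pad` method also accepts the argument, which is the typical path when padding is deferred to batching time (for example in a data collator). A hedged usage sketch, again with an illustrative checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# Encode without padding, then pad the whole batch in one call: every sequence
# is brought up to the longest length rounded to the next multiple of 8.
batch = [tokenizer("a short one"), tokenizer("a noticeably longer example sentence")]
padded = tokenizer.pad(batch, padding=True, pad_to_multiple_of=8, return_tensors="pt")
assert padded["input_ids"].shape[1] % 8 == 0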
@@ -1842,7 +1881,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

-       # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+       # Convert padding_strategy in PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )
@@ -1852,6 +1891,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
+               pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1872,6 +1912,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
+               pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
@@ -1887,6 +1928,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+       pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """ Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
@@ -1902,6 +1944,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                The tokenizer padding sides are defined in self.padding_side:
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
+           pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+               This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+               >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
@@ -1911,6 +1956,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(encoded_inputs["input_ids"])

+       if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+           max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
        needs_to_be_padded = (
            padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length
        )
...
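
The rounding in `_pad` above is plain integer ceiling arithmetic; for example, a longest sequence of 37 tokens with `pad_to_multiple_of=8` is padded out to 40:

max_length = 37
pad_to_multiple_of = 8

if max_length % pad_to_multiple_of != 0:
    # (37 // 8 + 1) * 8 == 40, the next multiple of 8 at or above 37
    max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

assert max_length == 40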
@@ -241,25 +241,26 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        return self._tokenizer.encode(text, pair, add_special_tokens=add_special_tokens).tokens

    def set_truncation_and_padding(
-       self, padding_strategy: PaddingStrategy, truncation_strategy: TruncationStrategy, max_length: int, stride: int,
+       self,
+       padding_strategy: PaddingStrategy,
+       truncation_strategy: TruncationStrategy,
+       max_length: int,
+       stride: int,
+       pad_to_multiple_of: Optional[int],
    ):
-       """ This contextmanager is in charge of defining the truncation and the padding strategies for fast tokenizers
+       """ Define the truncation and the padding strategies for fast tokenizers
        (provided by HuggingFace tokenizers library) and restore the tokenizer settings afterwards.

-       This contextmanager assumes the provider tokenizer has no padding / truncation strategy
+       The provided tokenizer has no padding / truncation strategy
        before the managed section. If your tokenizer set a padding / truncation strategy before,
        then it will be reset to no padding/truncation when exiting the managed section.

        Args:
-           tokenizer (BaseTokenizerFast): The tokenizer which will be used
-           max_length (int): The maximum size of the sequence
-           stride (int): The stride to use when handling overflow
-           strategy (str): Overflowing logic to use
-           pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length
-           padding_side (str): "left" or "right" indicating the direction the output sequence will be padded
-           pad_token_id (int): The integer representation of the padding token to use
-           pad_token_type_id (int): The integer representation of the padding token type to use
-           pad_token (str): The string representation of the padding token to use
+           padding_strategy (:obj:`PaddingStrategy`): The kind of padding that will be applied to the input
+           truncation_strategy (:obj:`TruncationStrategy`): The kind of truncation that will be applied to the input
+           max_length (:obj:`int`): The maximum size of the sequence
+           stride (:obj:`int`): The stride to use when handling overflow
+           pad_to_multiple_of (:obj:`int`, `optional`, defaults to `None`)
        """
        # Set truncation and padding on the backend tokenizer
@@ -275,6 +276,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                pad_id=self.pad_token_id,
                pad_type_id=self.pad_token_type_id,
                pad_token=self.pad_token,
+               pad_to_multiple_of=pad_to_multiple_of,
            )
        else:
            self._tokenizer.no_padding()
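
On the fast-tokenizer path the value is simply forwarded to the padding configuration of the backend HuggingFace tokenizers library, so the high-level call behaves the same as with the slow tokenizers. A hedged sketch, with an illustrative checkpoint and assuming a fast tokenizer is available for it:

from transformers import AutoTokenizer

# use_fast=True routes through PreTrainedTokenizerFast and the backend library.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)  # illustrative checkpoint

batch = tokenizer(["short", "a noticeably longer example sentence"], padding=True, pad_to_multiple_of=8)
for ids in batch["input_ids"]:
    assert len(ids) % 8 == 0  # every row padded to a multiple of 8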
@@ -290,6 +292,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -315,6 +318,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
+           pad_to_multiple_of=pad_to_multiple_of,
        )

        # Avoid thread overhead if only one example.
@@ -383,6 +387,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
+       pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
@@ -403,6 +408,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
+           pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
...
@@ -883,6 +883,40 @@ class TokenizerTesterMixin:
        assert sequence_length == padded_sequence_right_length
        assert encoded_sequence == padded_sequence_right

+   def test_padding_to_multiple_of(self):
+       tokenizers = self.get_tokenizers()
+       for tokenizer in tokenizers:
+           if tokenizer.pad_token is None:
+               self.skipTest("No padding token.")
+           else:
+               with self.subTest(f"{tokenizer.__class__.__name__}"):
+                   empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
+                   normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
+                   for key, value in empty_tokens.items():
+                       self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
+                   for key, value in normal_tokens.items():
+                       self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
+
+                   normal_tokens = tokenizer("This", pad_to_multiple_of=8)
+                   for key, value in normal_tokens.items():
+                       self.assertNotEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
+
+                   # Should also work with truncation
+                   normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8)
+                   for key, value in normal_tokens.items():
+                       self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
+
+                   # truncation to something which is not a multiple of pad_to_multiple_of raises an error
+                   self.assertRaises(
+                       ValueError,
+                       tokenizer.__call__,
+                       "This",
+                       padding=True,
+                       truncation=True,
+                       max_length=12,
+                       pad_to_multiple_of=8,
+                   )

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
...