Unverified Commit 0ed4d0df authored by Li-Huai (Allan) Lin, committed by GitHub

Fix `LayoutXLM` docstrings (#17038)

* Fix docstrings

* Fix legacy issue

* up

* apply suggestions

* up

* quality
parent 4b1ed797
@@ -109,53 +109,61 @@ LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING = r"""
      - `'np'`: Return Numpy `np.ndarray` objects.
"""

LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-add_special_tokens (`bool`, *optional*, defaults to `True`):
-    Whether or not to encode the sequences with the special tokens relative to their model.
-padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-    Activates and controls padding. Accepts the following values:
-    - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-      sequence if provided).
-    - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-      acceptable input length for the model if that argument is not provided.
-    - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-      lengths).
-truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-    Activates and controls truncation. Accepts the following values:
-    - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-      to the maximum acceptable input length for the model if that argument is not provided. This will
-      truncate token by token, removing a token from the longest sequence in the pair if a pair of
-      sequences (or a batch of pairs) is provided.
-    - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-      maximum acceptable input length for the model if that argument is not provided. This will only
-      truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-    - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-      maximum acceptable input length for the model if that argument is not provided. This will only
-      truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-    - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-      greater than the model maximum admissible input size).
-max_length (`int`, *optional*):
-    Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
-    `None`, this will use the predefined model maximum length if a maximum length is required by one of the
-    truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
-    truncation/padding to a maximum length will be deactivated.
-stride (`int`, *optional*, defaults to 0):
-    If set to a number along with `max_length`, the overflowing tokens returned when
-    `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
-    returned to provide some overlap between truncated and overflowing sequences. The value of this
-    argument defines the number of overlapping tokens.
-pad_to_multiple_of (`int`, *optional*):
-    If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
-    the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-return_tensors (`str` or [`~utils.TensorType`], *optional*):
-    If set, will return tensors instead of list of python integers. Acceptable values are:
-    - `'tf'`: Return TensorFlow `tf.constant` objects.
-    - `'pt'`: Return PyTorch `torch.Tensor` objects.
-    - `'np'`: Return Numpy `np.ndarray` objects.
+return_token_type_ids (`bool`, *optional*):
+    Whether to return token type IDs. If left to the default, will return the token type IDs according to
+    the specific tokenizer's default, defined by the `return_outputs` attribute.
+    [What are token type IDs?](../glossary#token-type-ids)
+return_attention_mask (`bool`, *optional*):
+    Whether to return the attention mask. If left to the default, will return the attention mask according
+    to the specific tokenizer's default, defined by the `return_outputs` attribute.
+    [What are attention masks?](../glossary#attention-mask)
+return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+    Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+    of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+    of returning overflowing tokens.
+return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+    Whether or not to return special tokens mask information.
+return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+    Whether or not to return `(char_start, char_end)` for each token.
+    This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+    Python's tokenizer, this method will raise `NotImplementedError`.
+return_length (`bool`, *optional*, defaults to `False`):
+    Whether or not to return the lengths of the encoded inputs.
+verbose (`bool`, *optional*, defaults to `True`):
+    Whether or not to print more information and warnings.
+**kwargs: passed to the `self.tokenize()` method
+Return:
+    [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+    - **input_ids** -- List of token ids to be fed to a model.
+      [What are input IDs?](../glossary#input-ids)
+    - **bbox** -- List of bounding boxes to be fed to a model.
+    - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+      if *"token_type_ids"* is in `self.model_input_names`).
+      [What are token type IDs?](../glossary#token-type-ids)
+    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+      [What are attention masks?](../glossary#attention-mask)
+    - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+    - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+      `return_overflowing_tokens=True`).
+    - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+      `return_overflowing_tokens=True`).
+    - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+      regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+    - **length** -- The length of the inputs (when `return_length=True`).
"""
...

@@ -20,11 +20,9 @@ from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece as spm

-from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,

@@ -44,6 +42,110 @@ from ..xlm_roberta.tokenization_xlm_roberta import (
logger = logging.get_logger(__name__)

LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to encode the sequences with the special tokens relative to their model.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.
stride (`int`, *optional*, defaults to 0):
If set to a number along with `max_length`, the overflowing tokens returned when
`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
returned to provide some overlap between truncated and overflowing sequences. The value of this
argument defines the number of overlapping tokens.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
return_token_type_ids (`bool`, *optional*):
Whether to return token type IDs. If left to the default, will return the token type IDs according to
the specific tokenizer's default, defined by the `return_outputs` attribute.
[What are token type IDs?](../glossary#token-type-ids)
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute.
[What are attention masks?](../glossary#attention-mask)
return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
of returning overflowing tokens.
return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
Whether or not to return special tokens mask information.
return_offsets_mapping (`bool`, *optional*, defaults to `False`):
Whether or not to return `(char_start, char_end)` for each token.
This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
Python's tokenizer, this method will raise `NotImplementedError`.
return_length (`bool`, *optional*, defaults to `False`):
Whether or not to return the lengths of the encoded inputs.
verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
**kwargs: passed to the `self.tokenize()` method
Return:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model.
[What are input IDs?](../glossary#input-ids)
- **bbox** -- List of bounding boxes to be fed to a model.
- **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
if *"token_type_ids"* is in `self.model_input_names`).
[What are token type IDs?](../glossary#token-type-ids)
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
[What are attention masks?](../glossary#attention-mask)
- **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
- **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
`return_overflowing_tokens=True`).
- **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
`return_overflowing_tokens=True`).
- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
- **length** -- The length of the inputs (when `return_length=True`).
"""
class LayoutXLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
@@ -339,7 +441,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
        return (out_vocab_file,)

-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],

@@ -543,7 +645,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
        return BatchEncoding(batch_outputs)

-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def _batch_prepare_for_model(
         self,
         batch_text_or_text_pairs,

@@ -666,7 +768,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
            verbose=verbose,
        )

-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def prepare_for_model(
         self,
         text: Union[TextInput, PreTokenizedInput],
...
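Why swapping the decorator argument is all these hunks need: `add_end_docstrings` appends the given docstring fragments to the decorated function's `__doc__`, so the rendered API docs pick up whichever constant is passed in. A simplified stand-in for that mechanism (not the library's exact implementation):

```python
def add_end_docstrings(*docstr):
    # Simplified stand-in for transformers' add_end_docstrings utility:
    # concatenate the given fragments onto the decorated function's docstring.
    def docstring_decorator(fn):
        fn.__doc__ = (fn.__doc__ or "") + "".join(docstr)
        return fn
    return docstring_decorator


ENCODE_KWARGS = "    add_special_tokens (`bool`): ...\n    padding (`bool` or `str`): ...\n"


class DemoTokenizer:
    @add_end_docstrings(ENCODE_KWARGS)
    def __call__(self, text, boxes=None):
        """Tokenize `text` and align `boxes` with the resulting tokens.

        Args:
        """


# The appended kwargs now show up in help(DemoTokenizer.__call__) and in generated docs.
assert "add_special_tokens" in DemoTokenizer.__call__.__doc__
```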
@@ -19,11 +19,10 @@ import os
 from shutil import copyfile
 from typing import Dict, List, Optional, Tuple, Union

-from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
+from transformers.models.layoutxlm.tokenization_layoutxlm import LAYOUTXLM_ENCODE_KWARGS_DOCSTRING
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,

@@ -166,7 +165,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword

-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
...
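Since `return_offsets_mapping` is only supported by the fast tokenizer (the SentencePiece-based slow one raises `NotImplementedError`), a hedged sketch with `LayoutXLMTokenizerFast` (checkpoint and inputs are illustrative; needs the `tokenizers` backend):

```python
from transformers import LayoutXLMTokenizerFast

tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")

words = ["Total", "amount", "EUR", "1.234,56"]
boxes = [[40, 40, 90, 55], [95, 40, 160, 55], [165, 40, 200, 55], [205, 40, 280, 55]]

encoding = tokenizer(
    words,
    boxes=boxes,
    return_offsets_mapping=True,  # (char_start, char_end) per token; fast tokenizers only
)

# Offsets are character spans inside each original word, aligned one-to-one with input_ids
# (special tokens get the dummy span (0, 0)).
for token, span in zip(tokenizer.convert_ids_to_tokens(encoding["input_ids"]), encoding["offset_mapping"]):
    print(token, span)
```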