Unverified Commit de38a6e4 authored by Sylvain Gugger, committed by GitHub

Fix 9918 (#9932)

* Initial work

* Fix doc styler and other models
parent 1809de51
@@ -56,6 +56,8 @@ PreTrainedTokenizer
     :special-members: __call__
     :members:

+    .. automethod:: encode
+
 PreTrainedTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -64,6 +66,8 @@ PreTrainedTokenizerFast
     :special-members: __call__
     :members:

+    .. automethod:: encode
+
 BatchEncoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
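For reference, a quick sketch (not part of this diff) of what the newly documented `encode` method returns compared to calling the tokenizer directly; the checkpoint name and printed ids are only illustrative:

```python
from transformers import AutoTokenizer

# Illustration only: `encode` returns a plain list of token ids, while calling the
# tokenizer itself returns a BatchEncoding with input_ids, attention_mask, etc.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.encode("Hello world"))  # e.g. [101, 7592, 2088, 102]
print(tokenizer("Hello world").keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
```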
@@ -364,28 +364,35 @@ DPR_ENCODERS_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
             :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`,
-            `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0,
-            1]``:
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.

-            `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of
-            shape :obj:`(batch_size, sequence_length)`, `optional`): Segment token indices to indicate first and second
-            portions of the inputs. Indices are selected in ``[0, 1]``:
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:

             - 0 corresponds to a `sentence A` token,
             - 1 corresponds to a `sentence B` token.

-            `What are token type IDs? <../glossary.html#token-type-ids>`_ inputs_embeds (:obj:`torch.FloatTensor` of
-            shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Optionally, instead of passing
-            :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want
-            more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal
-            embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the
-            attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail.
-            output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers.
-            See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`):
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
             Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
 """
@@ -403,6 +410,8 @@ DPR_READER_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
             more details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
@@ -486,15 +486,17 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
         (a) For sequence pairs (for a pair title+text for example):

-            ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+        ::

-            ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
+            tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1

         (b) For single sequences (for a question for example):

-            ``tokens: [CLS] the dog is hairy . [SEP]``
+        ::

-            ``token_type_ids: 0 0 0 0 0 0 0``
+            tokens: [CLS] the dog is hairy . [SEP]
+            token_type_ids: 0 0 0 0 0 0 0

         DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
         rather than the left.
@@ -502,6 +504,8 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
             :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
             details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
@@ -412,6 +412,8 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize
             the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that
             tokenizer class to obtain the indices.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
@@ -1041,6 +1041,8 @@ T5_INPUTS_DOCSTRING = r"""
             :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
             detail.

+            `What are input IDs? <../glossary.html#input-ids>`__
+
             To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training
             <./t5.html#training>`__.
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -929,7 +929,7 @@ T5_START_DOCSTRING = r"""
 T5_INPUTS_DOCSTRING = r"""
     Args:
-        inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+        input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
             should be able to pad the inputs on the right or the left.
@@ -937,6 +937,8 @@ T5_INPUTS_DOCSTRING = r"""
             :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
             details.

+            `What are input IDs? <../glossary.html#input-ids>`__
+
             To know more on how to prepare :obj:`inputs` for pretraining take a look at `T5 Training
             <./t5.html#training>`__.
         decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
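For reference, a minimal sketch (not part of this diff) of the `input_ids` / `decoder_input_ids` conventions the T5 docstrings describe; `t5-small` is only an example checkpoint, and passing `labels` lets the PyTorch model derive `decoder_input_ids` internally:

```python
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Illustration only: T5 uses relative position embeddings, so inputs may be padded
# on either side; `labels` is shifted right internally to produce decoder_input_ids.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=labels)
print(outputs.loss)
```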
@@ -135,6 +135,14 @@ class CodeStyler:
         """
         return SpecialBlock.NOT_SPECIAL

+    def end_of_special_style(self, line):
+        """
+        Sets back the `in_block` attribute to `NOT_SPECIAL`.
+
+        Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
+        """
+        self.in_block = SpecialBlock.NOT_SPECIAL
+
     def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
         """
         Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
@@ -220,6 +228,7 @@ class CodeStyler:
         new_lines = []
         paragraph = []
         self.current_indent = ""
+        self.previous_indent = None
         # If one of those is True, the paragraph should not be touched (code samples, lists...)
         no_style = False
         no_style_next = False
@@ -251,7 +260,7 @@ class CodeStyler:
                     self.current_indent = indent
                 elif not indent.startswith(self.current_indent):
                     # If not, we are leaving the block when we unindent.
-                    self.in_block = SpecialBlock.NOT_SPECIAL
+                    self.end_of_special_style(paragraph[0])

                 if self.is_special_block(paragraph[0]):
                     # Maybe we are starting a special block.
@@ -326,6 +335,8 @@ class DocstringStyler(CodeStyler):
     def is_special_block(self, line):
         if self.is_no_style_block(line):
+            if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST:
+                self.previous_indent = self.current_indent
             self.in_block = SpecialBlock.NO_STYLE
             return True
         if _re_arg_def.search(line) is not None:
@@ -333,6 +344,14 @@ class DocstringStyler(CodeStyler):
             return True
         return False

+    def end_of_special_style(self, line):
+        if self.previous_indent is not None and line.startswith(self.previous_indent):
+            self.in_block = SpecialBlock.ARG_LIST
+            self.current_indent = self.previous_indent
+        else:
+            self.in_block = SpecialBlock.NOT_SPECIAL
+            self.previous_indent = None
+
     def init_in_block(self, text):
         lines = text.split("\n")
         while len(lines) > 0 and len(lines[0]) == 0:
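For reference, the toy class below (not part of this diff) sketches the save/restore idea behind the `previous_indent` changes: entering a no-style block from an argument list remembers the current indent, and `end_of_special_style` returns to `ARG_LIST` when the next paragraph is still indented under it. Apart from the names quoted from the diff, the class and its helper method are hypothetical.

```python
from enum import Enum, auto


class SpecialBlock(Enum):
    NOT_SPECIAL = auto()
    ARG_LIST = auto()
    NO_STYLE = auto()


class MiniDocstringStyler:
    """Toy sketch of the state handling added in this commit (illustration only)."""

    def __init__(self):
        self.in_block = SpecialBlock.NOT_SPECIAL
        self.current_indent = ""
        self.previous_indent = None

    def enter_no_style_block(self):
        # Save the argument-list indent before switching to a no-style block,
        # mirroring the check added to DocstringStyler.is_special_block.
        if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST:
            self.previous_indent = self.current_indent
        self.in_block = SpecialBlock.NO_STYLE

    def end_of_special_style(self, line):
        # Mirror DocstringStyler.end_of_special_style: resume ARG_LIST if the new
        # paragraph is still indented under the saved indent, otherwise reset.
        if self.previous_indent is not None and line.startswith(self.previous_indent):
            self.in_block = SpecialBlock.ARG_LIST
            self.current_indent = self.previous_indent
        else:
            self.in_block = SpecialBlock.NOT_SPECIAL
            self.previous_indent = None


styler = MiniDocstringStyler()
styler.in_block, styler.current_indent = SpecialBlock.ARG_LIST, "        "
styler.enter_no_style_block()  # e.g. a `::` literal block inside an argument description
styler.end_of_special_style("            - 1 for tokens that are **not masked**,")
assert styler.in_block is SpecialBlock.ARG_LIST  # back in the arg list, not NOT_SPECIAL
```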