chenpangpang / transformers / Commits / 3e9fdcf0

Unverified commit 3e9fdcf0, authored Jan 10, 2022 by Suraj Patil, committed by GitHub on Jan 10, 2022.

[DOC] fix doc examples for bart-like models (#15093)

* fix doc examples
* remove double colons
parent 61d18ae0

Showing 16 changed files with 291 additions and 216 deletions (+291, -216)
src/transformers/models/bart/modeling_bart.py (+25, -18)
src/transformers/models/bart/modeling_flax_bart.py (+25, -17)
src/transformers/models/bart/modeling_tf_bart.py (+23, -16)
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py (+11, -9)
src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py (+2, -2)
src/transformers/models/fsmt/modeling_fsmt.py (+13, -10)
src/transformers/models/led/modeling_led.py (+35, -30)
src/transformers/models/m2m_100/modeling_m2m_100.py (+10, -8)
src/transformers/models/mbart/modeling_flax_mbart.py (+26, -19)
src/transformers/models/mbart/modeling_mbart.py (+27, -19)
src/transformers/models/mbart/modeling_tf_mbart.py (+25, -16)
src/transformers/models/pegasus/modeling_flax_pegasus.py (+2, -2)
src/transformers/models/pegasus/modeling_pegasus.py (+15, -10)
src/transformers/models/pegasus/modeling_tf_pegasus.py (+15, -10)
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py (+26, -21)
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py (+11, -9)
src/transformers/models/bart/modeling_bart.py
@@ -534,33 +534,40 @@ BART_START_DOCSTRING = r"""
 """

 BART_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
-
-        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
-
-        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
-
-    Mask filling example::
-
-        >>> from transformers import BartTokenizer, BartForConditionalGeneration
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        >>> TXT = "My friends are <mask> but they eat too many carbs."
-
-        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
-        >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
-        >>> logits = model(input_ids).logits
-
-        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
-        >>> probs = logits[0, masked_index].softmax(dim=0)
-        >>> values, predictions = probs.topk(5)
-
-        >>> tokenizer.decode(predictions).split()
+    Summarization example:
+
+    ```python
+    >>> from transformers import BartTokenizer, BartForConditionalGeneration
+
+    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import BartTokenizer, BartForConditionalGeneration
+
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+    >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
+    >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
+    >>> logits = model(input_ids).logits
+
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+    >>> probs = logits[0, masked_index].softmax(dim=0)
+    >>> values, predictions = probs.topk(5)
+
+    >>> tokenizer.decode(predictions).split()
+    ```
 """

 BART_INPUTS_DOCSTRING = r"""
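Editor's note: the move from a per-sequence `tokenizer.decode` list comprehension to `tokenizer.batch_decode` is behavior-preserving. A minimal sketch of the equivalence, assuming only that the `facebook/bart-large-cnn` checkpoint used above is available:

```python
# Minimal sketch: batch_decode(ids) decodes each sequence in a batch,
# matching a per-sequence decode loop (checkpoint as in the example above).
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
ids = tokenizer(["My friends are cool but they eat too many carbs."], return_tensors="pt")["input_ids"]

per_sequence = [tokenizer.decode(g, skip_special_tokens=True) for g in ids]
batched = tokenizer.batch_decode(ids, skip_special_tokens=True)
assert per_sequence == batched
```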
src/transformers/models/bart/modeling_flax_bart.py
@@ -1506,32 +1506,40 @@ class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel):
 FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING = """
     Returns:

-    Summarization example::
-
-        >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
-
-        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
-
-        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='jax')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids']).sequences
-        >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
-
-    Mask filling example::
-
-        >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        >>> TXT = "My friends are <mask> but they eat too many carbs."
-
-        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large')
-        >>> input_ids = tokenizer([TXT], return_tensors='jax')['input_ids']
-        >>> logits = model(input_ids).logits
-
-        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
-        >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
-        >>> values, predictions = jax.lax.top_k(probs)
-
-        >>> tokenizer.decode(predictions).split()
+    Summarization example:
+
+    ```python
+    >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+
+    >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"]).sequences
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+
+    >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large")
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+
+    >>> TXT = "My friends are <mask> but they eat too many carbs."
+    >>> input_ids = tokenizer([TXT], return_tensors="jax")["input_ids"]
+
+    >>> logits = model(input_ids).logits
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
+    >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
+    >>> values, predictions = jax.lax.top_k(probs)
+
+    >>> tokenizer.decode(predictions).split()
+    ```
 """

 overwrite_call_docstring(
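Editor's note: as recorded above, the mask-filling example calls `jax.lax.top_k(probs)` without the required `k` argument, so it would raise a `TypeError` if executed. A hedged sketch of the call as `jax.lax.top_k` actually expects it:

```python
# Sketch: jax.lax.top_k takes the number of elements k as a required argument
# and returns the k largest values together with their indices.
import jax.numpy as jnp
from jax import lax

probs = jnp.array([0.05, 0.5, 0.2, 0.15, 0.1])
values, predictions = lax.top_k(probs, 5)  # top-5 probabilities and their indices
```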
src/transformers/models/bart/modeling_tf_bart.py
@@ -510,29 +510,36 @@ BART_START_DOCSTRING = r"""
 BART_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import BartTokenizer, TFBartForConditionalGeneration, BartConfig
-
-        >>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-
-        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='tf')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
-
-    Mask filling example::
-
-        >>> from transformers import BartTokenizer, TFBartForConditionalGeneration
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        >>> TXT = "My friends are <mask> but they eat too many carbs."
-
-        >>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
-        >>> input_ids = tokenizer([TXT], return_tensors='tf')['input_ids']
-        >>> logits = model(input_ids).logits
-        >>> probs = tf.nn.softmax(logits[0])
-        >>> # probs[5] is associated with the mask token
+    Summarization example:
+
+    ```python
+    >>> from transformers import BartTokenizer, TFBartForConditionalGeneration
+
+    >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+
+    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="tf")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import BartTokenizer, TFBartForConditionalGeneration
+
+    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+    >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+    >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")
+    >>> input_ids = tokenizer([TXT], return_tensors="tf")["input_ids"]
+    >>> logits = model(input_ids).logits
+    >>> probs = tf.nn.softmax(logits[0])
+    >>> # probs[5] is associated with the mask token
+    ```
 """
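Editor's note: the comment "probs[5] is associated with the mask token" hard-codes the position of `<mask>` in this particular sentence. A hedged sketch that locates the position dynamically instead, reusing `input_ids`, `logits`, and `tokenizer` from the example above (single-mask input assumed):

```python
# Sketch: find the <mask> position rather than hard-coding index 5.
import tensorflow as tf

masked_index = int(tf.where(input_ids[0] == tokenizer.mask_token_id)[0, 0])
probs = tf.nn.softmax(logits[0, masked_index])
values, predictions = tf.math.top_k(probs, k=5)
print(tokenizer.decode(predictions))
```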
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -1619,19 +1619,21 @@ BIGBIRD_PEGASUS_START_DOCSTRING = r"""
 """

 BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import PegasusTokenizer, BigBirdPegasusForConditionalGeneration, BigBirdPegasusConfig
-
-        >>> model = BigBirdPegasusForConditionalGeneration.from_pretrained('google/bigbird-pegasus-large-arxiv')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')
-
-        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors='pt', truncation=True)
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+    Summarization example:
+
+    ```python
+    >>> from transformers import PegasusTokenizer, BigBirdPegasusForConditionalGeneration
+
+    >>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")
+    >>> tokenizer = PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
+
+    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True)
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
 """

 BIGBIRD_PEGASUS_INPUTS_DOCSTRING = r"""
src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -1482,7 +1482,7 @@ class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedModel):
 FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING = """
     Returns:

-    Summarization example::
+    Summarization example:

         >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
@@ -1495,7 +1495,7 @@ FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING = """
         >>> # Generate Summary
         >>> summary_ids = model.generate(inputs['input_ids']).sequences
         >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

-    Mask filling example::
+    Mask filling example:

         >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
         >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
         >>> TXT = "My friends are <mask> but they eat too many carbs."
src/transformers/models/fsmt/modeling_fsmt.py
@@ -199,16 +199,19 @@ FSMT_START_DOCSTRING = r"""
 FSMT_GENERATION_EXAMPLE = r"""
     Translation example::

-        from transformers import FSMTTokenizer, FSMTForConditionalGeneration
-
-        mname = "facebook/wmt19-ru-en"
-        model = FSMTForConditionalGeneration.from_pretrained(mname)
-        tokenizer = FSMTTokenizer.from_pretrained(mname)
-
-        src_text = "Машинное обучение - это здорово, не так ли?"
-        input_ids = tokenizer.encode(src_text, return_tensors='pt')
-        outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
-        for i, output in enumerate(outputs):
-            decoded = tokenizer.decode(output, skip_special_tokens=True)
-            print(f"{i}: {decoded})
-        # 1: Machine learning is great, isn't it? ...
+    ```python
+    >>> from transformers import FSMTTokenizer, FSMTForConditionalGeneration

+    >>> mname = "facebook/wmt19-ru-en"
+    >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
+    >>> tokenizer = FSMTTokenizer.from_pretrained(mname)

+    >>> src_text = "Машинное обучение - это здорово, не так ли?"
+    >>> input_ids = tokenizer(src_text, return_tensors="pt")
+    >>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
+    >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+    "Machine learning is great, isn't it?"
+    ```
 """
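Editor's note: the rewritten example requests `num_return_sequences=3` but decodes only the first candidate, and calling the tokenizer returns a `BatchEncoding` rather than a tensor. A hedged sketch that takes the ids tensor explicitly and decodes every candidate, assuming the setup lines from the example above:

```python
# Sketch (assumes mname, model, tokenizer, src_text from the example above).
# tokenizer(...) returns a BatchEncoding; take the input_ids tensor explicitly.
input_ids = tokenizer(src_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
for i, output in enumerate(outputs):
    print(i, tokenizer.decode(output, skip_special_tokens=True))
```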
src/transformers/models/led/modeling_led.py
@@ -1454,36 +1454,41 @@ LED_START_DOCSTRING = r"""
 """

 LED_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> import torch
-        >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
-
-        >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-large-16384-arxiv')
-        >>> tokenizer = LEDTokenizer.from_pretrained('allenai/led-large-16384-arxiv')
-
-        >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
-        ...     results in a wide range of natural language tasks including generative
-        ...     language modeling (Dai et al., 2019; Radford et al., 2019) and discriminative
-        ...     language understanding (Devlin et al., 2019). This success is partly due to
-        ...     the self-attention component which enables the network to capture contextual
-        ...     information from the entire sequence. While powerful, the memory and computational
-        ...     requirements of self-attention grow quadratically with sequence length, making
-        ...     it infeasible (or very expensive) to process long sequences.
-        ...
-        ...     To address this limitation, we present Longformer, a modified Transformer
-        ...     architecture with a self-attention operation that scales linearly with the
-        ...     sequence length, making it versatile for processing long documents (Fig 1). This
-        ...     is an advantage for natural language tasks such as long document classification,
-        ...     question answering (QA), and coreference resolution, where existing approaches
-        ...     partition or shorten the long context into smaller sequences that fall within the
-        ...     typical 512 token limit of BERT-style pretrained models. Such partitioning could
-        ...     potentially result in loss of important cross-partition information, and to
-        ...     mitigate this problem, existing methods often rely on complex architectures to
-        ...     address such interactions. On the other hand, our proposed Longformer is able to
-        ...     build contextual representations of the entire context using multiple layers of
-        ...     attention, reducing the need for task-specific architectures.'''
-        >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors='pt')
-
-        >>> # Global attention on the first token (cf. Beltagy et al. 2020)
-        >>> global_attention_mask = torch.zeros_like(inputs)
-        >>> global_attention_mask[:, 0] = 1
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask,
-        ...                              num_beams=3, max_length=32, early_stopping=True)
-        >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    Summarization example:
+
+    ```python
+    >>> import torch
+    >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
+
+    >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
+    >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
+
+    >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
+    ...     results in a wide range of natural language tasks including generative language modeling
+    ...     (Dai et al., 2019; Radford et al., 2019) and discriminative
+    ...     language understanding (Devlin et al., 2019).
+    ...     This success is partly due to the self-attention component which enables the network to capture contextual
+    ...     information from the entire sequence. While powerful, the memory and computational requirements of
+    ...     self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
+    ...     process long sequences. To address this limitation, we present Longformer, a modified Transformer
+    ...     architecture with a self-attention operation that scales linearly with the sequence length, making it
+    ...     versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
+    ...     long document classification, question answering (QA), and coreference resolution, where existing approaches
+    ...     partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
+    ...     of BERT-style pretrained models. Such partitioning could potentially result in loss of important
+    ...     cross-partition information, and to mitigate this problem, existing methods often rely on complex
+    ...     architectures to address such interactions. On the other hand, our proposed Longformer is able to build
+    ...     contextual representations of the entire context using multiple layers of attention, reducing the need for
+    ...     task-specific architectures.'''
+    >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
+
+    >>> # Global attention on the first token (cf. Beltagy et al. 2020)
+    >>> global_attention_mask = torch.zeros_like(inputs)
+    >>> global_attention_mask[:, 0] = 1
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
+    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    ```
 """

 LED_INPUTS_DOCSTRING = r"""
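Editor's note: unlike the other examples in this commit, the LED example uses `tokenizer.encode`, which returns the ids tensor directly rather than a `BatchEncoding` dict. A small sketch of the two equivalent forms, assuming `tokenizer` and `ARTICLE_TO_SUMMARIZE` from the example above:

```python
# Sketch: both forms produce the same [1, seq_len] tensor of input ids.
ids_from_encode = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
ids_from_call = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt")["input_ids"]
assert bool((ids_from_encode == ids_from_call).all())
```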
src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -566,17 +566,19 @@ M2M_100_START_DOCSTRING = r"""
 M2M_100_GENERATION_EXAMPLE = r"""
     Translation example::

-        >>> from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
-
-        >>> model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-        >>> tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M')
-
-        >>> text_to_translate = "Life is like a box of chocolates"
-        >>> model_inputs = tokenizer(text_to_translate, return_tensors='pt')
-
-        >>> # translate to French
-        >>> gen_tokens = model.generate( **model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
-        >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
+    ```python
+    >>> from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
+
+    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+    >>> text_to_translate = "Life is like a box of chocolates"
+    >>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
+
+    >>> # translate to French
+    >>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
+    >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
+    ```
 """

 M2M_100_INPUTS_DOCSTRING = r"""
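Editor's note: the same call pattern reaches any of the model's target languages by swapping the forced BOS token. A hedged sketch reusing `model`, `tokenizer`, and `model_inputs` from the example above (German chosen purely for illustration):

```python
# Sketch: translate to German instead of French by forcing the
# target-language BOS token on the decoder.
gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("de"))
print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
```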
src/transformers/models/mbart/modeling_flax_mbart.py
@@ -1530,34 +1530,41 @@ class FlaxMBartForConditionalGeneration(FlaxMBartPreTrainedModel):
 FLAX_MBART_CONDITIONAL_GENERATION_DOCSTRING = r"""
     Returns:

-    Summarization example::
-
-        >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration, MBartConfig
-
-        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-
-        >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='np')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True).sequences
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
-
-    Mask filling example::
-
-        >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-        >>> # de_DE is the language symbol id <LID> for German
-        >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
-
-        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors='np')['input_ids']
-        >>> logits = model(input_ids).logits
-
-        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
-        >>> probs = logits[0, masked_index].softmax(dim=0)
-        >>> values, predictions = probs.topk(5)
-
-        >>> tokenizer.decode(predictions).split()
+    Summarization example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration, MBartConfig
+
+    >>> model = FlaxMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5).sequences
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+
+    >>> model = FlaxMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> # de_DE is the language symbol id <LID> for German
+    >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
+
+    >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="np")["input_ids"]
+    >>> logits = model(input_ids).logits
+
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
+    >>> probs = logits[0, masked_index].softmax(dim=0)
+    >>> values, predictions = probs.topk(5)
+
+    >>> tokenizer.decode(predictions).split()
+    ```
 """

 overwrite_call_docstring(
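Editor's note: `add_special_tokens=False` matters here because the `</s>` markers and the `de_DE` language symbol are written into `TXT` by hand; letting the tokenizer add its own special tokens would duplicate them. A small sketch of the contrast, assuming `tokenizer` and `TXT` from the example above:

```python
# Sketch: with add_special_tokens=True the tokenizer would append its own
# eos/language tokens on top of the ones already spelled out in TXT.
manual = tokenizer([TXT], add_special_tokens=False, return_tensors="np")["input_ids"]
auto = tokenizer([TXT], add_special_tokens=True, return_tensors="np")["input_ids"]
print(manual.shape, auto.shape)  # auto is longer by the duplicated special tokens
```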
src/transformers/models/mbart/modeling_mbart.py
@@ -532,34 +532,42 @@ MBART_START_DOCSTRING = r"""
 """

 MBART_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import MBartTokenizer, MBartForConditionalGeneration, MBartConfig
-
-        >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-
-        >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
-
-    Mask filling example::
-
-        >>> from transformers import MBartTokenizer, MBartForConditionalGeneration
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-        >>> # de_DE is the language symbol id <LID> for German
-        >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
-
-        >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors='pt')['input_ids']
-        >>> logits = model(input_ids).logits
-
-        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
-        >>> probs = logits[0, masked_index].softmax(dim=0)
-        >>> values, predictions = probs.topk(5)
-
-        >>> tokenizer.decode(predictions).split()
+    Summarization example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, MBartForConditionalGeneration
+
+    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, MBartForConditionalGeneration
+
+    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> # de_DE is the language symbol id <LID> for German
+    >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
+
+    >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"]
+    >>> logits = model(input_ids).logits
+
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+    >>> probs = logits[0, masked_index].softmax(dim=0)
+    >>> values, predictions = probs.topk(5)
+
+    >>> tokenizer.decode(predictions).split()
+    ```
 """

 MBART_INPUTS_DOCSTRING = r"""
src/transformers/models/mbart/modeling_tf_mbart.py
@@ -591,29 +591,38 @@ MBART_INPUTS_DOCSTRING = r"""
 """

 MBART_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import MBartTokenizer, TFMBartForConditionalGeneration, MBartConfig
-
-        >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-
-        >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='tf')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
-
-    Mask filling example::
-
-        >>> from transformers import MBartTokenizer, TFMBartForConditionalGeneration
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
-        >>> # de_DE is the language symbol id <LID> for German
-        >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
-
-        >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors='tf')['input_ids']
-        >>> logits = model(input_ids).logits
-        >>> probs = tf.nn.softmax(logits[0])
-        >>> # probs[5] is associated with the mask token
+    Summarization example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, TFMBartForConditionalGeneration, MBartConfig
+
+    >>> model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="tf")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import MBartTokenizer, TFMBartForConditionalGeneration
+
+    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+    >>> # de_DE is the language symbol id <LID> for German
+    >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
+
+    >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="tf")["input_ids"]
+    >>> logits = model(input_ids).logits
+    >>> probs = tf.nn.softmax(logits[0])
+    >>> # probs[5] is associated with the mask token
+    ```
 """
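Editor's note: in the rewritten mask-filling example above, the import is `TFMBartForConditionalGeneration` but the model is instantiated as `MBartForConditionalGeneration`, the PyTorch class name, which is undefined in a TF-only docstring as written. A hedged sketch of the presumably intended line:

```python
# Sketch: use the TF class, matching the import in the example above.
model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
```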
src/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -1480,7 +1480,7 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
 FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
     Returns:

-    Summarization example::
+    Summarization example:

         >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
@@ -1493,7 +1493,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
         >>> # Generate Summary
         >>> summary_ids = model.generate(inputs['input_ids']).sequences
         >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

-    Mask filling example::
+    Mask filling example:

         >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
         >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
         >>> TXT = "My friends are <mask> but they eat too many carbs."
src/transformers/models/pegasus/modeling_pegasus.py
@@ -512,20 +512,25 @@ PEGASUS_START_DOCSTRING = r"""
 """

 PEGASUS_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration
-
-        >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
-
-        >>> ARTICLE_TO_SUMMARIZE = (
-        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
-        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
-        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
-        ... )
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'])
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+    Summarization example:
+
+    ```python
+    >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration
+
+    >>> model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
+    >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
+
+    >>> ARTICLE_TO_SUMMARIZE = (
+    ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+    ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+    ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+    ... )
+    >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"])
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
 """

 PEGASUS_INPUTS_DOCSTRING = r"""
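Editor's note: the rewrite also drops the list wrapper around `ARTICLE_TO_SUMMARIZE`; the tokenizer accepts either a single string or a list of strings, and for one article both yield a batch of size one. A small sketch, assuming `tokenizer` from the example above:

```python
# Sketch: a single string and a one-element list tokenize to the same [1, seq_len] batch.
single = tokenizer("PG&E stated it scheduled the blackouts.", return_tensors="pt")["input_ids"]
listed = tokenizer(["PG&E stated it scheduled the blackouts."], return_tensors="pt")["input_ids"]
assert single.shape == listed.shape
```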
src/transformers/models/pegasus/modeling_tf_pegasus.py
@@ -555,20 +555,25 @@ PEGASUS_START_DOCSTRING = r"""
 """

 PEGASUS_GENERATION_EXAMPLE = r"""
-    Summarization example::
-
-        >>> from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
-
-        >>> model = TFPegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
-
-        >>> ARTICLE_TO_SUMMARIZE = (
-        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
-        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
-        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
-        ... )
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='tf')
-
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids'])
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+    Summarization example:
+
+    ```python
+    >>> from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
+
+    >>> model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
+    >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
+
+    >>> ARTICLE_TO_SUMMARIZE = (
+    ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+    ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+    ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+    ... )
+    >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="tf")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs["input_ids"])
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
 """

 PEGASUS_INPUTS_DOCSTRING = r"""
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
@@ -2605,8 +2605,9 @@ class Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
 FLAX_{{cookiecutter.uppercase_modelname}}_CONDITIONAL_GENERATION_DOCSTRING = """
     Returns:

-    Summarization example::
+    Summarization example:

+    ```python
     >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
     >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
@@ -2618,22 +2619,26 @@ FLAX_{{cookiecutter.uppercase_modelname}}_CONDITIONAL_GENERATION_DOCSTRING = """
     >>> # Generate Summary
     >>> summary_ids = model.generate(inputs['input_ids']).sequences
     >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```

-    Mask filling example::
+    Mask filling example:

+    ```python
     >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-    >>> TXT = "My friends are <mask> but they eat too many carbs."
     >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+    >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+    >>> TXT = "My friends are <mask> but they eat too many carbs."
     >>> input_ids = tokenizer([TXT], return_tensors='np')['input_ids']
     >>> logits = model(input_ids).logits
     >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
     >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
     >>> values, predictions = jax.lax.top_k(probs)
     >>> tokenizer.decode(predictions).split()
+    ```
 """

 overwrite_call_docstring(
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -2067,9 +2067,10 @@ class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
 """

 {{cookiecutter.uppercase_modelname}}_GENERATION_EXAMPLE = r"""
-    Summarization example::
+    Summarization example:

-    >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}Config
+    ```python
+    >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration

     >>> model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
     >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
@@ -2078,8 +2079,9 @@ class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
     >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')

     >>> # Generate Summary
-    >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-    >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+    >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5)
+    >>> print(tokenizer.decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
 """

 {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""