"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "1609a436eca115853b5a4cfd80b9ec2302bb9fcc"
Unverified Commit b5e2b183, authored by Sylvain Gugger and committed by GitHub

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
Parent commit: e13f72fb
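The heart of the change is that documentation code examples are now run through black. A minimal sketch of the idea, assuming black's public `format_str` API and a 119-character line length (the helper below is illustrative, not the repository's actual `style_doc` implementation):

```python
import black


def format_doc_example(code: str, max_len: int = 119) -> str:
    """Format a single doc code example with black; the 119-char line length is an assumption."""
    return black.format_str(code, mode=black.Mode(line_length=max_len)).rstrip("\n")


# Single quotes become double quotes and long calls get wrapped, as in the hunks below.
print(format_doc_example("tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')"))
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
```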
@@ -58,11 +58,13 @@ predicted token ids.
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(batch):
...     speech, _ = sf.read(batch["file"])
...     batch["speech"] = speech
...     return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)

@@ -81,7 +83,11 @@ predicted token ids.
>>> from transformers import pipeline
>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )
>>> translation_de = asr(librispeech_en[0]["file"])
```

...
@@ -98,8 +98,8 @@ language modeling head on top of the decoder.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
```

@@ -120,8 +120,8 @@ language modeling head on top of the decoder.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
```

@@ -148,7 +148,7 @@ language modeling head on top of the decoder.
ignored. The code example below illustrates all of this.
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
tokenizer = T5Tokenizer.from_pretrained("t5-small")

@@ -168,18 +168,19 @@ language modeling head on top of the decoder.
# encode the inputs
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]
-encoding = tokenizer([task_prefix + sequence for sequence in input_sequences],
-                     padding='longest',
-                     max_length=max_source_length,
-                     truncation=True,
-                     return_tensors="pt")
+encoding = tokenizer(
+    [task_prefix + sequence for sequence in input_sequences],
+    padding="longest",
+    max_length=max_source_length,
+    truncation=True,
+    return_tensors="pt",
+)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
# encode the targets
-target_encoding = tokenizer([output_sequence_1, output_sequence_2],
-                            padding='longest',
-                            max_length=max_target_length,
-                            truncation=True)
+target_encoding = tokenizer(
+    [output_sequence_1, output_sequence_2], padding="longest", max_length=max_target_length, truncation=True
+)
labels = target_encoding.input_ids
# replace padding token id's of the labels by -100

@@ -218,12 +219,12 @@ There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encode
generation works in general in encoder-decoder models.
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Das Haus ist wunderbar.

@@ -242,17 +243,17 @@ model = T5ForConditionalGeneration.from_pretrained("t5-small")
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left
tokenizer.padding_side = "left"
-tokenizer.pad_token = tokenizer.eos_token # to avoid an error
-task_prefix = 'translate English to German: '
-sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+tokenizer.pad_token = tokenizer.eos_token  # to avoid an error
+task_prefix = "translate English to German: "
+sentences = ["The house is wonderful.", "I like to work in NYC."]  # use different length sentences to test batching
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
output_sequences = model.generate(
-    input_ids=inputs['input_ids'],
-    attention_mask=inputs['attention_mask'],
-    do_sample=False, # disable sampling to test if batching affects output
+    input_ids=inputs["input_ids"],
+    attention_mask=inputs["attention_mask"],
+    do_sample=False,  # disable sampling to test if batching affects output
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

...

@@ -22,7 +22,7 @@ One can directly plug in the weights of T5v1.1 into a T5 model, like so:
```python
from transformers import T5ForConditionalGeneration
-model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
+model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base")
```
T5 Version 1.1 includes the following improvements compared to the original T5 model:

...
@@ -77,11 +77,13 @@ The following example shows how to get the last hidden state using [`VisualBertM
>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({
-...     "visual_embeds": visual_embeds,
-...     "visual_token_type_ids": visual_token_type_ids,
-...     "visual_attention_mask": visual_attention_mask
-... })
+>>> inputs.update(
+...     {
+...         "visual_embeds": visual_embeds,
+...         "visual_token_type_ids": visual_token_type_ids,
+...         "visual_attention_mask": visual_attention_mask,
+...     }
+... )
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```

...
@@ -50,9 +50,8 @@ For instance:
```python
>>> model = AutoModel.from_pretrained(
->>>     "julien-c/EsperBERTo-small",
->>>     revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... )
```

## Push your model from Python

@@ -344,9 +343,8 @@ You may specify a revision by using the `revision` flag in the `from_pretrained`
```python
>>> tokenizer = AutoTokenizer.from_pretrained(
->>>     "julien-c/EsperBERTo-small",
->>>     revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... )
```

## Workflow in a Colab notebook

...
@@ -62,18 +62,18 @@ The different languages this model/tokenizer handles, as well as the ids of thes
These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
```py
->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
```
We should now define the language embedding by using the previously defined language id. We want to create a tensor
filled with the appropriate language ids, of the same size as input_ids. For english, the id is 0:
```py
->>> language_id = tokenizer.lang2id['en'] # 0
->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
+>>> language_id = tokenizer.lang2id["en"]  # 0
+>>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
>>> # We reshape it to be of size (batch_size, sequence_length)
->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+>>> langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)
```
You can then feed it all as input to your model:

...
@@ -69,8 +69,9 @@ Let's demonstrate this process with GPT-2.
```python
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
-device = 'cuda'
-model_id = 'gpt2-large'
+
+device = "cuda"
+model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
```

@@ -81,8 +82,9 @@ dataset in memory.
```python
from datasets import load_dataset
-test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
-encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
+
+test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
```
With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative

@@ -104,10 +106,10 @@ nlls = []
for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, encodings.input_ids.size(1))
-    trg_len = end_loc - i # may be different from stride on last loop
-    input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
+    trg_len = end_loc - i  # may be different from stride on last loop
+    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
-    target_ids[:,:-trg_len] = -100
+    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
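The hunk is cut off inside the `torch.no_grad()` block. For readers of this excerpt, a hedged sketch of how the sliding-window computation is typically finished; it continues the loop above (reusing `outputs`, `trg_len`, `nlls`, and `end_loc`), and scaling the mean loss back to a sum is an approximation rather than the guide's verbatim code:

```python
        # outputs.loss is the mean negative log-likelihood over the trg_len target
        # tokens; scale it back to a sum so windows of different lengths add up correctly.
        neg_log_likelihood = outputs.loss * trg_len

    nlls.append(neg_log_likelihood)

# Perplexity is the exponentiated average negative log-likelihood per token.
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
```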
...

@@ -36,7 +36,8 @@ To automatically download the vocab used during pretraining or fine-tuning a giv
```py
from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
```

## Base use

@@ -75,9 +76,7 @@ If you have several sentences you want to process, you can do this efficiently b
tokenizer:
```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
>>> encoded_inputs = tokenizer(batch_sentences)
>>> print(encoded_inputs)
{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],

@@ -174,12 +173,12 @@ If you have a list of pairs of sequences you want to process, you should feed th
list of first sentences and the list of second sentences:
```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
->>> batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
-...                              "And I should be encoded with the second sentence",
-...                              "And I go with the very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
+>>> batch_of_second_sentences = [
+...     "I'm a sentence that goes with the first sentence",
+...     "And I should be encoded with the second sentence",
+...     "And I go with the very last one",
+... ]
>>> encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
>>> print(encoded_inputs)
{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102],

@@ -199,7 +198,7 @@ To double-check what is fed to the model, we can decode each list in _input_ids_
```py
>>> for ids in encoded_inputs["input_ids"]:
->>>     print(tokenizer.decode(ids))
+...     print(tokenizer.decode(ids))
[CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]
[CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP]
[CLS] And the very very last one [SEP] And I go with the very last one [SEP]

@@ -307,35 +306,43 @@ This works exactly as before for batch of sentences or batch of pairs of sentenc
like this:
```py
-batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
-                   ["And", "another", "sentence"],
-                   ["And", "the", "very", "very", "last", "one"]]
+batch_sentences = [
+    ["Hello", "I'm", "a", "single", "sentence"],
+    ["And", "another", "sentence"],
+    ["And", "the", "very", "very", "last", "one"],
+]
encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)
```
or a batch of pair sentences like this:
```py
-batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
-                             ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
-                             ["And", "I", "go", "with", "the", "very", "last", "one"]]
+batch_of_second_sentences = [
+    ["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
+    ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
+    ["And", "I", "go", "with", "the", "very", "last", "one"],
+]
encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)
```
And you can add padding, truncation as well as directly return tensors like before:
```py
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="pt")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="pt",
+)
===PT-TF-SPLIT===
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="tf")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="tf",
+)
```
@@ -57,7 +57,8 @@ pip install tensorflow
```py
>>> from transformers import pipeline
->>> classifier = pipeline('sentiment-analysis')
+
+>>> classifier = pipeline("sentiment-analysis")
```
When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will

@@ -67,7 +68,7 @@ make them readable. For instance:
```py
->>> classifier('We are very happy to show you the 🤗 Transformers library.')
+>>> classifier("We are very happy to show you the 🤗 Transformers library.")
[{'label': 'POSITIVE', 'score': 0.9998}]
```

@@ -75,8 +76,7 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
a list of dictionaries like this one:
```py
->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
-...                       "We hope you don't hate it."])
+>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
>>> for result in results:
...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
label: POSITIVE, with score: 0.9998

@@ -102,7 +102,7 @@ see how we can use it.
You can directly pass the name of the model to use to [`pipeline`]:
```py
->>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+>>> classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
```
This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also

@@ -125,13 +125,13 @@ any other model from the model hub):
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
===PT-TF-SPLIT===
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> # This model only exists in PyTorch, so we use the _from_pt_ flag to import that model in TensorFlow.
>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
```
If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a

@@ -150,11 +150,13 @@ As we saw, the model and tokenizer are created using the `from_pretrained` metho
```py
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -199,7 +201,7 @@ and get tensors back. You can specify all of that to the tokenizer:
...     padding=True,
...     truncation=True,
...     max_length=512,
-...     return_tensors="pt"
+...     return_tensors="pt",
... )
===PT-TF-SPLIT===
>>> tf_batch = tokenizer(

@@ -207,7 +209,7 @@ and get tensors back. You can specify all of that to the tokenizer:
...     padding=True,
...     truncation=True,
...     max_length=512,
-...     return_tensors="tf"
+...     return_tensors="tf",
... )
```

@@ -267,9 +269,11 @@ Let's apply the SoftMax activation to get predictions.
```py
>>> from torch import nn
>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
===PT-TF-SPLIT===
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
```

@@ -291,13 +295,15 @@ attribute:
```py
>>> import torch
->>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+
+>>> pt_outputs = pt_model(**pt_batch, labels=torch.tensor([1, 0]))
>>> print(pt_outputs)
SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833, 4.3364],
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
===PT-TF-SPLIT===
>>> import tensorflow as tf
->>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+>>> tf_outputs = tf_model(tf_batch, labels=tf.constant([1, 0]))
>>> print(tf_outputs)
TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.0833 , 4.3364 ],

@@ -317,11 +323,11 @@ case the attributes not set (that have `None` values) are ignored.
Once your model is fine-tuned, you can save it with its tokenizer in the following way:
```py
->>> pt_save_directory = './pt_save_pretrained'
+>>> pt_save_directory = "./pt_save_pretrained"
>>> tokenizer.save_pretrained(pt_save_directory)
>>> pt_model.save_pretrained(pt_save_directory)
===PT-TF-SPLIT===
->>> tf_save_directory = './tf_save_pretrained'
+>>> tf_save_directory = "./tf_save_pretrained"
>>> tokenizer.save_pretrained(tf_save_directory)
>>> tf_model.save_pretrained(tf_save_directory)
```

@@ -343,10 +349,12 @@ Then, use the corresponding Auto class to load it like this:
```py
>>> from transformers import AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)
===PT-TF-SPLIT===
>>> from transformers import TFAutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
```

@@ -356,11 +364,11 @@ Lastly, you can also ask the model to return all hidden states and all attention
```py
>>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states = pt_outputs.hidden_states
>>> all_attentions = pt_outputs.attentions
===PT-TF-SPLIT===
>>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states = tf_outputs.hidden_states
>>> all_attentions = tf_outputs.attentions
```

@@ -376,11 +384,13 @@ directly instantiate model and tokenizer without the auto magic:
```py
>>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

@@ -401,13 +411,15 @@ the model from scratch. Therefore, we instantiate the model from a configuration
```py
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
>>> model = DistilBertForSequenceClassification(config)
===PT-TF-SPLIT===
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
>>> model = TFDistilBertForSequenceClassification(config)
```

@@ -419,11 +431,13 @@ configuration appropriately:
```py
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

...
@@ -109,6 +109,7 @@ This export can now be used in the ONNX inference runtime:
import onnxruntime as ort
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
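The hunk stops right after the session is created. As a hedged illustration for readers of this excerpt (not part of the diff), running the exported model could look like the sketch below; treating the first graph output as the last hidden state is an assumption that may not hold for other exports:

```python
# Tokenize with NumPy tensors so the encoding can be fed to ONNX Runtime as-is.
inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")

# ort_session.run(output_names, input_feed): passing None returns every graph output.
outputs = ort_session.run(None, dict(inputs))
last_hidden_state = outputs[0]  # assumption: first output of this particular export
print(last_hidden_state.shape)
```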
@@ -382,7 +383,7 @@ tokenized_text = enc.tokenize(text)
# Masking one of the input tokens
masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
+tokenized_text[masked_index] = "[MASK]"
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

@@ -393,8 +394,14 @@ dummy_input = [tokens_tensor, segments_tensors]
# Initializing the model with the torchscript flag
# Flag set to True even though it is not necessary as this model does not have an LM Head.
-config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
# Instantiating the model
model = BertModel(config)
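The hunk ends at model instantiation; the surrounding guide goes on to trace the model. A minimal, hedged sketch of that next step (not part of the diff; `tokens_tensor` and `segments_tensors` are the dummy tensors referenced in the hunk header above):

```python
import torch

# Trace the model with the dummy inputs so the graph can run without the Python class.
model.eval()
traced_model = torch.jit.trace(model, (tokens_tensor, segments_tensors))

# The traced module can be serialized and reloaded later.
torch.jit.save(traced_model, "traced_bert.pt")
loaded_model = torch.jit.load("traced_bert.pt")
```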
...

@@ -188,11 +188,15 @@ positions of the extracted answer in the text.
```py
>>> result = question_answerer(question="What is extractive question answering?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95
>>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160
```

@@ -232,18 +236,20 @@ Here is an example of question answering using a model and a tokenizer. The proc
>>> for question in questions:
...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
...     input_ids = inputs["input_ids"].tolist()[0]
...
...     outputs = model(**inputs)
...     answer_start_scores = outputs.start_logits
...     answer_end_scores = outputs.end_logits
...
...     # Get the most likely beginning of answer with the argmax of the score
...     answer_start = torch.argmax(answer_start_scores)
...     # Get the most likely end of answer with the argmax of the score
...     answer_end = torch.argmax(answer_end_scores) + 1
...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
...     print(f"Question: {question}")
...     print(f"Answer: {answer}")
Question: How many pretrained models are available in 🤗 Transformers?

@@ -275,18 +281,20 @@ Answer: tensorflow 2. 0 and pytorch
>>> for question in questions:
...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
...     input_ids = inputs["input_ids"].numpy()[0]
...
...     outputs = model(inputs)
...     answer_start_scores = outputs.start_logits
...     answer_end_scores = outputs.end_logits
...
...     # Get the most likely beginning of answer with the argmax of the score
...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
...     # Get the most likely end of answer with the argmax of the score
...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
...     print(f"Question: {question}")
...     print(f"Answer: {answer}")
Question: How many pretrained models are available in 🤗 Transformers?

@@ -327,7 +335,12 @@ This outputs the sequences with the mask filled, the confidence score, and the t
```py
>>> from pprint import pprint
->>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+>>> pprint(
+...     unmasker(
+...         f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."
+...     )
+... )
[{'score': 0.1793,
  'sequence': 'HuggingFace is creating a tool that the community uses to solve '
              'NLP tasks.',

@@ -374,8 +387,10 @@ Here is an example of doing masked language modeling using a model and a tokeniz
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
>>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
-...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
+...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
>>> inputs = tokenizer(sequence, return_tensors="pt")
>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

@@ -399,8 +414,10 @@ Distilled models are smaller than the models they mimic. Using them instead of t
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
-...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
+...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
>>> inputs = tokenizer(sequence, return_tensors="tf")
>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]

@@ -544,7 +561,7 @@ Below is an example of text generation using `XLNet` and its tokenizer, which in
>>> prompt_length = len(tokenizer.decode(inputs[0]))
>>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
>>> print(generated)
Today the weather is really nice and I am planning ...

@@ -571,7 +588,7 @@ Today the weather is really nice and I am planning ...
>>> prompt_length = len(tokenizer.decode(inputs[0]))
>>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
>>> print(generated)
Today the weather is really nice and I am planning ...

@@ -660,8 +677,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
>>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...     "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
>>> inputs = tokenizer(sequence, return_tensors="pt")
>>> tokens = inputs.tokens()

@@ -675,8 +694,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
>>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...     "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
>>> inputs = tokenizer(sequence, return_tensors="tf")
>>> tokens = inputs.tokens()

@@ -863,7 +884,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
>>> inputs = tokenizer(
...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="pt"
+...     return_tensors="pt",
... )
>>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

@@ -877,7 +898,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
>>> inputs = tokenizer(
...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="tf"
+...     return_tensors="tf",
... )
>>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

...
...@@ -422,14 +422,14 @@ Let's depict the GPU requirements in the following table: ...@@ -422,14 +422,14 @@ Let's depict the GPU requirements in the following table:
For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:
```python ```python no-style
@require_torch_multi_gpu @require_torch_multi_gpu
def test_example_with_multi_gpu(): def test_example_with_multi_gpu():
``` ```
If a test requires `tensorflow` use the `require_tf` decorator. For example: If a test requires `tensorflow` use the `require_tf` decorator. For example:
```python ```python no-style
@require_tf @require_tf
def test_tf_thing_with_tensorflow(): def test_tf_thing_with_tensorflow():
``` ```
...@@ -437,7 +437,7 @@ def test_tf_thing_with_tensorflow(): ...@@ -437,7 +437,7 @@ def test_tf_thing_with_tensorflow():
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up: how to set it up:
```python ```python no-style
@require_torch_gpu @require_torch_gpu
@slow @slow
def test_example_slow_on_gpu(): def test_example_slow_on_gpu():
...@@ -446,7 +446,7 @@ def test_example_slow_on_gpu(): ...@@ -446,7 +446,7 @@ def test_example_slow_on_gpu():
Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed
last for them to work correctly. Here is an example of the correct usage: last for them to work correctly. Here is an example of the correct usage:
```python ```python no-style
@parameterized.expand(...) @parameterized.expand(...)
@require_torch_multi_gpu @require_torch_multi_gpu
def test_integration_foo(): def test_integration_foo():
...@@ -461,7 +461,8 @@ Inside tests: ...@@ -461,7 +461,8 @@ Inside tests:
```python ```python
from transformers.testing_utils import get_gpu_count from transformers.testing_utils import get_gpu_count
n_gpu = get_gpu_count() # works with torch and tf
n_gpu = get_gpu_count() # works with torch and tf
``` ```
### Distributed training ### Distributed training
...@@ -544,12 +545,16 @@ the test, but then there is no way of running that test for just one set of argu ...@@ -544,12 +545,16 @@ the test, but then there is no way of running that test for just one set of argu
# test_this1.py # test_this1.py
import unittest import unittest
from parameterized import parameterized from parameterized import parameterized
class TestMathUnitTest(unittest.TestCase): class TestMathUnitTest(unittest.TestCase):
@parameterized.expand([ @parameterized.expand(
("negative", -1.5, -2.0), [
("integer", 1, 1.0), ("negative", -1.5, -2.0),
("large fraction", 1.6, 1), ("integer", 1, 1.0),
]) ("large fraction", 1.6, 1),
]
)
def test_floor(self, name, input, expected): def test_floor(self, name, input, expected):
assert_equal(math.floor(input), expected) assert_equal(math.floor(input), expected)
``` ```
...@@ -601,6 +606,8 @@ Here is the same example, this time using `pytest`'s `parametrize` marker: ...@@ -601,6 +606,8 @@ Here is the same example, this time using `pytest`'s `parametrize` marker:
```python ```python
# test_this2.py # test_this2.py
import pytest import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name, input, expected", "name, input, expected",
[ [
...@@ -669,6 +676,8 @@ To start using those all you need is to make sure that the test resides in a sub ...@@ -669,6 +676,8 @@ To start using those all you need is to make sure that the test resides in a sub
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class PathExampleTest(TestCasePlus): class PathExampleTest(TestCasePlus):
def test_something_involving_local_locations(self): def test_something_involving_local_locations(self):
data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
...@@ -679,6 +688,8 @@ If you don't need to manipulate paths via `pathlib` or you just need a path as a ...@@ -679,6 +688,8 @@ If you don't need to manipulate paths via `pathlib` or you just need a path as a
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class PathExampleTest(TestCasePlus): class PathExampleTest(TestCasePlus):
def test_something_involving_stringified_locations(self): def test_something_involving_stringified_locations(self):
examples_dir = self.examples_dir_str examples_dir = self.examples_dir_str
...@@ -700,6 +711,8 @@ Here is an example of its usage: ...@@ -700,6 +711,8 @@ Here is an example of its usage:
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class ExamplesTests(TestCasePlus): class ExamplesTests(TestCasePlus):
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir() tmp_dir = self.get_auto_remove_tmp_dir()
...@@ -759,6 +772,7 @@ If you need to temporarily override `sys.path` to import from another test for exa ...@@ -759,6 +772,7 @@
```python ```python
import os import os
from transformers.testing_utils import ExtendSysPath from transformers.testing_utils import ExtendSysPath
bindir = os.path.abspath(os.path.dirname(__file__)) bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/.."): with ExtendSysPath(f"{bindir}/.."):
from test_trainer import TrainerIntegrationCommon # noqa from test_trainer import TrainerIntegrationCommon # noqa
...@@ -786,20 +800,20 @@ code that's buggy causes some bad state that will affect other tests, do not use ...@@ -786,20 +800,20 @@ code that's buggy causes some bad state that will affect other tests, do not use
- Here is how to skip a whole test unconditionally: - Here is how to skip a whole test unconditionally:
```python ```python no-style
@unittest.skip("this bug needs to be fixed") @unittest.skip("this bug needs to be fixed")
def test_feature_x(): def test_feature_x():
``` ```
or via pytest: or via pytest:
```python ```python no-style
@pytest.mark.skip(reason="this bug needs to be fixed") @pytest.mark.skip(reason="this bug needs to be fixed")
``` ```
or the `xfail` way: or the `xfail` way:
```python ```python no-style
@pytest.mark.xfail @pytest.mark.xfail
def test_feature_x(): def test_feature_x():
``` ```
...@@ -816,6 +830,7 @@ or the whole module: ...@@ -816,6 +830,7 @@ or the whole module:
```python ```python
import pytest import pytest
if not pytest.config.getoption("--custom-flag"): if not pytest.config.getoption("--custom-flag"):
pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True)
``` ```
...@@ -835,21 +850,21 @@ docutils = pytest.importorskip("docutils", minversion="0.3") ...@@ -835,21 +850,21 @@ docutils = pytest.importorskip("docutils", minversion="0.3")
- Skip a test based on a condition: - Skip a test based on a condition:
```python ```python no-style
@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") @pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher")
def test_feature_x(): def test_feature_x():
``` ```
or: or:
```python ```python no-style
@unittest.skipIf(torch_device == "cpu", "Can't do half precision") @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
def test_feature_x(): def test_feature_x():
``` ```
or skip the whole module: or skip the whole module:
```python ```python no-style
@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") @pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows")
class TestClass(): class TestClass():
def test_feature_x(self): def test_feature_x(self):
...@@ -863,7 +878,7 @@ The library of tests is ever-growing, and some of the tests take minutes to run, ...@@ -863,7 +878,7 @@ The library of tests is ever-growing, and some of the tests take minutes to run,
an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be
marked as shown in the example below: marked as shown in the example below:
```python ```python no-style
from transformers.testing_utils import slow from transformers.testing_utils import slow
@slow @slow
def test_integration_foo(): def test_integration_foo():
...@@ -878,8 +893,8 @@ RUN_SLOW=1 pytest tests ...@@ -878,8 +893,8 @@ RUN_SLOW=1 pytest tests
Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators
`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: `@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage:
```python ```python no-style
@parameterized.expand(...) @parameterized.expand(...)
@slow @slow
def test_integration_foo(): def test_integration_foo():
``` ```
...@@ -935,13 +950,21 @@ In order to test functions that write to `stdout` and/or `stderr`, the test can ...@@ -935,13 +950,21 @@ In order to test functions that write to `stdout` and/or `stderr`, the test can
```python ```python
import sys import sys
def print_to_stdout(s): print(s)
def print_to_stderr(s): sys.stderr.write(s)
def print_to_stdout(s):
print(s)
def print_to_stderr(s):
sys.stderr.write(s)
def test_result_and_stdout(capsys): def test_result_and_stdout(capsys):
msg = "Hello" msg = "Hello"
print_to_stdout(msg) print_to_stdout(msg)
print_to_stderr(msg) print_to_stderr(msg)
out, err = capsys.readouterr() # consume the captured output streams out, err = capsys.readouterr() # consume the captured output streams
# optional: if you want to replay the consumed streams: # optional: if you want to replay the consumed streams:
sys.stdout.write(out) sys.stdout.write(out)
sys.stderr.write(err) sys.stderr.write(err)
...@@ -954,10 +977,13 @@ And, of course, most of the time, `stderr` will come as a part of an exception, ...@@ -954,10 +977,13 @@ And, of course, most of the time, `stderr` will come as a part of an exception,
a case: a case:
```python ```python
def raise_exception(msg): raise ValueError(msg) def raise_exception(msg):
raise ValueError(msg)
def test_something_exception(): def test_something_exception():
msg = "Not a good value" msg = "Not a good value"
error = '' error = ""
try: try:
raise_exception(msg) raise_exception(msg)
except Exception as e: except Exception as e:
...@@ -970,7 +996,12 @@ Another approach to capturing stdout is via `contextlib.redirect_stdout`: ...@@ -970,7 +996,12 @@ Another approach to capturing stdout is via `contextlib.redirect_stdout`:
```python ```python
from io import StringIO from io import StringIO
from contextlib import redirect_stdout from contextlib import redirect_stdout
def print_to_stdout(s): print(s)
def print_to_stdout(s):
print(s)
def test_result_and_stdout(): def test_result_and_stdout():
msg = "Hello" msg = "Hello"
buffer = StringIO() buffer = StringIO()
...@@ -993,6 +1024,7 @@ some `\r`'s in it or not, so it's as simple as: ...@@ -993,6 +1024,7 @@
```python ```python
from transformers.testing_utils import CaptureStdout from transformers.testing_utils import CaptureStdout
with CaptureStdout() as cs: with CaptureStdout() as cs:
function_that_writes_to_stdout() function_that_writes_to_stdout()
print(cs.out) print(cs.out)
...@@ -1002,17 +1034,19 @@ Here is a full test example: ...@@ -1002,17 +1034,19 @@ Here is a full test example:
```python ```python
from transformers.testing_utils import CaptureStdout from transformers.testing_utils import CaptureStdout
msg = "Secret message\r" msg = "Secret message\r"
final = "Hello World" final = "Hello World"
with CaptureStdout() as cs: with CaptureStdout() as cs:
print(msg + final) print(msg + final)
assert cs.out == final+"\n", f"captured: {cs.out}, expecting {final}" assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}"
``` ```
If you'd like to capture `stderr`, use the `CaptureStderr` class instead: If you'd like to capture `stderr`, use the `CaptureStderr` class instead:
```python ```python
from transformers.testing_utils import CaptureStderr from transformers.testing_utils import CaptureStderr
with CaptureStderr() as cs: with CaptureStderr() as cs:
function_that_writes_to_stderr() function_that_writes_to_stderr()
print(cs.err) print(cs.err)
...@@ -1022,6 +1056,7 @@ If you need to capture both streams at once, use the parent `CaptureStd` class: ...@@ -1022,6 +1056,7 @@ If you need to capture both streams at once, use the parent `CaptureStd` class:
```python ```python
from transformers.testing_utils import CaptureStd from transformers.testing_utils import CaptureStd
with CaptureStd() as cs: with CaptureStd() as cs:
function_that_writes_to_stdout_and_stderr() function_that_writes_to_stdout_and_stderr()
print(cs.err, cs.out) print(cs.err, cs.out)
...@@ -1044,7 +1079,7 @@ logging.set_verbosity_info() ...@@ -1044,7 +1079,7 @@ logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.bart.tokenization_bart") logger = logging.get_logger("transformers.models.bart.tokenization_bart")
with CaptureLogger(logger) as cl: with CaptureLogger(logger) as cl:
logger.info(msg) logger.info(msg)
assert cl.out == msg+"\n" assert cl.out == msg + "\n"
``` ```
### Testing with environment variables ### Testing with environment variables
...@@ -1054,6 +1089,8 @@ If you want to test the impact of environment variables for a specific test you ...@@ -1054,6 +1089,8 @@ If you want to test the impact of environment variables for a specific test you
```python ```python
from transformers.testing_utils import mockenv from transformers.testing_utils import mockenv
class HfArgumentParserTest(unittest.TestCase): class HfArgumentParserTest(unittest.TestCase):
@mockenv(TRANSFORMERS_VERBOSITY="error") @mockenv(TRANSFORMERS_VERBOSITY="error")
def test_env_override(self): def test_env_override(self):
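    # e.g. (assumed body): assert os.getenv("TRANSFORMERS_VERBOSITY") == "error"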
...@@ -1065,6 +1102,8 @@ multiple local paths. A helper class `transformers.test_utils.TestCasePlus` come ...@@ -1065,6 +1102,8 @@ multiple local paths. A helper class `transformers.test_utils.TestCasePlus` come
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class EnvExampleTest(TestCasePlus): class EnvExampleTest(TestCasePlus):
def test_external_prog(self): def test_external_prog(self):
env = self.get_env() env = self.get_env()
...@@ -1089,16 +1128,20 @@ seed = 42 ...@@ -1089,16 +1128,20 @@ seed = 42
# python RNG # python RNG
import random import random
random.seed(seed) random.seed(seed)
# pytorch RNGs # pytorch RNGs
import torch import torch
torch.manual_seed(seed) torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
# numpy RNG # numpy RNG
import numpy as np import numpy as np
np.random.seed(seed) np.random.seed(seed)
# tf RNG # tf RNG
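# a common choice for TF 2.x (assumption):
#   import tensorflow as tf
#   tf.random.set_seed(seed)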
......
...@@ -104,6 +104,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans ...@@ -104,6 +104,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans
```py ```py
>>> from transformers import BertTokenizer >>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> tokenizer.tokenize("I have a new GPU!") >>> tokenizer.tokenize("I have a new GPU!")
["i", "have", "a", "new", "gp", "##u", "!"] ["i", "have", "a", "new", "gp", "##u", "!"]
...@@ -117,6 +118,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex ...@@ -117,6 +118,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex
```py ```py
>>> from transformers import XLNetTokenizer >>> from transformers import XLNetTokenizer
>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
>>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
......
...@@ -74,6 +74,7 @@ However, we can instead apply these preprocessing steps to all the splits of our ...@@ -74,6 +74,7 @@ However, we can instead apply these preprocessing steps to all the splits of our
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples["text"], padding="max_length", truncation=True) return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
``` ```
...@@ -82,8 +83,8 @@ You can learn more about the map method or the other ways to preprocess the data ...@@ -82,8 +83,8 @@ You can learn more about the map method or the other ways to preprocess the data
Next we will generate a small subset of the training and validation set, to enable faster training: Next we will generate a small subset of the training and validation set, to enable faster training:
```python ```python
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"] full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"] full_eval_dataset = tokenized_datasets["test"]
``` ```
...@@ -130,9 +131,7 @@ Then we can instantiate a [`Trainer`] like this: ...@@ -130,9 +131,7 @@ Then we can instantiate a [`Trainer`] like this:
```python ```python
from transformers import Trainer from transformers import Trainer
trainer = Trainer( trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)
``` ```
To fine-tune our model, we just need to call To fine-tune our model, we just need to call
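A minimal sketch of that call, using the `trainer` instance created above:

```python
# launch fine-tuning with the Trainer API
trainer.train()
```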
...@@ -160,6 +159,7 @@ from datasets import load_metric ...@@ -160,6 +159,7 @@ from datasets import load_metric
metric = load_metric("accuracy") metric = load_metric("accuracy")
def compute_metrics(eval_pred): def compute_metrics(eval_pred):
logits, labels = eval_pred logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1) predictions = np.argmax(logits, axis=-1)
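    # (typically the function then returns metric.compute(predictions=predictions, references=labels))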
...@@ -322,12 +322,7 @@ from transformers import get_scheduler ...@@ -322,12 +322,7 @@ from transformers import get_scheduler
num_epochs = 3 num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader) num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler( lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps
)
``` ```
One last thing, we will want to use the GPU if we have access to one (otherwise training might take several hours One last thing, we will want to use the GPU if we have access to one (otherwise training might take several hours
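A minimal sketch of that device placement, assuming standard PyTorch and the `model` instantiated earlier:

```python
import torch

# prefer a GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
```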
...@@ -372,7 +367,7 @@ use a metric from the datasets library. Here we accumulate the predictions at ea ...@@ -372,7 +367,7 @@ use a metric from the datasets library. Here we accumulate the predictions at ea
result when the loop is finished. result when the loop is finished.
```python ```python
metric= load_metric("accuracy") metric = load_metric("accuracy")
model.eval() model.eval()
for batch in eval_dataloader: for batch in eval_dataloader:
batch = {k: v.to(device) for k, v in batch.items()} batch = {k: v.to(device) for k, v in batch.items()}
......
...@@ -488,15 +488,20 @@ class PretrainedConfig(PushToHubMixin): ...@@ -488,15 +488,20 @@ class PretrainedConfig(PushToHubMixin):
```python ```python
# We can't directly instantiate the base class *PretrainedConfig* so let's show the examples on a # We can't directly instantiate the base class *PretrainedConfig* so let's show the examples on a
# derived class: BertConfig # derived class: BertConfig
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. config = BertConfig.from_pretrained(
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')* "bert-base-uncased"
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') ) # Download configuration from huggingface.co and cache.
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) config = BertConfig.from_pretrained(
"./test/saved_model/"
) # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
assert config.output_attentions == True assert config.output_attentions == True
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, config, unused_kwargs = BertConfig.from_pretrained(
foo=False, return_unused_kwargs=True) "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
)
assert config.output_attentions == True assert config.output_attentions == True
assert unused_kwargs == {'foo': False} assert unused_kwargs == {"foo": False}
```""" ```"""
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
......
...@@ -588,6 +588,7 @@ class SquadProcessor(DataProcessor): ...@@ -588,6 +588,7 @@ class SquadProcessor(DataProcessor):
```python ```python
>>> import tensorflow_datasets as tfds >>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad") >>> dataset = tfds.load("squad")
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False) >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
......
...@@ -107,7 +107,7 @@ class DebugUnderflowOverflow: ...@@ -107,7 +107,7 @@ class DebugUnderflowOverflow:
given batch, and only do that for batches 1 and 3. Then you instantiate this class as: given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
```python ```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3]) debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
``` ```
And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed. And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
...@@ -121,7 +121,7 @@ class DebugUnderflowOverflow: ...@@ -121,7 +121,7 @@ class DebugUnderflowOverflow:
You can also specify the batch number after which to stop the training, with: You can also specify the batch number after which to stop the training, with:
```python ```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3) debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
``` ```
This feature is mainly useful in the tracing mode, but you can use it for any mode. This feature is mainly useful in the tracing mode, but you can use it for any mode.
......
...@@ -273,15 +273,22 @@ class FeatureExtractionMixin: ...@@ -273,15 +273,22 @@ class FeatureExtractionMixin:
```python ```python
# We can't directly instantiate the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a # We can't directly instantiate the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a
# derived class: *Wav2Vec2FeatureExtractor* # derived class: *Wav2Vec2FeatureExtractor*
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') # Download feature_extraction_config from huggingface.co and cache. feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/') # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')* "facebook/wav2vec2-base-960h"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json') ) # Download feature_extraction_config from huggingface.co and cache.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False) feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"./test/saved_model/"
) # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')*
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./test/saved_model/preprocessor_config.json")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False
)
assert feature_extractor.return_attention_mask is False assert feature_extractor.return_attention_mask is False
feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained(
foo=False, return_unused_kwargs=True) "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False, return_unused_kwargs=True
)
assert feature_extractor.return_attention_mask is False assert feature_extractor.return_attention_mask is False
assert unused_kwargs == {'foo': False} assert unused_kwargs == {"foo": False}
```""" ```"""
feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
......
...@@ -956,11 +956,11 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -956,11 +956,11 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -975,11 +975,11 @@ PT_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -975,11 +975,11 @@ PT_QUESTION_ANSWERING_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='pt') >>> inputs = tokenizer(question, text, return_tensors="pt")
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3]) >>> end_positions = torch.tensor([3])
...@@ -997,11 +997,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -997,11 +997,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1013,11 +1013,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1013,11 +1013,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}', problem_type="multi_label_classification") >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss >>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1032,8 +1032,8 @@ PT_MASKED_LM_SAMPLE = r""" ...@@ -1032,8 +1032,8 @@ PT_MASKED_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
...@@ -1051,8 +1051,8 @@ PT_BASE_MODEL_SAMPLE = r""" ...@@ -1051,8 +1051,8 @@ PT_BASE_MODEL_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
...@@ -1068,16 +1068,16 @@ PT_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1068,16 +1068,16 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained >>> # the linear classifier still needs to be trained
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1092,8 +1092,8 @@ PT_CAUSAL_LM_SAMPLE = r""" ...@@ -1092,8 +1092,8 @@ PT_CAUSAL_LM_SAMPLE = r"""
>>> import torch >>> import torch
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"]) >>> outputs = model(**inputs, labels=inputs["input_ids"])
...@@ -1112,8 +1112,8 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r""" ...@@ -1112,8 +1112,8 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained('{checkpoint}') >>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
...@@ -1134,8 +1134,8 @@ PT_SPEECH_CTC_SAMPLE = r""" ...@@ -1134,8 +1134,8 @@ PT_SPEECH_CTC_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained('{checkpoint}') >>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
...@@ -1164,8 +1164,8 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r""" ...@@ -1164,8 +1164,8 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
...@@ -1192,8 +1192,8 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r""" ...@@ -1192,8 +1192,8 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
...@@ -1216,8 +1216,8 @@ PT_SPEECH_XVECTOR_SAMPLE = r""" ...@@ -1216,8 +1216,8 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt")
...@@ -1227,7 +1227,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r""" ...@@ -1227,7 +1227,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> # the resulting embeddings can be used for cosine similarity-based retrieval >>> # the resulting embeddings can be used for cosine similarity-based retrieval
>>> cosine_sim = torch.nn.CosineSimilarity(dim=-1) >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
>>> similarity = cosine_sim(embeddings[0], embeddings[1]) >>> similarity = cosine_sim(embeddings[0], embeddings[1])
>>> threshold = 0.7 # the optimal threshold is dataset-dependent >>> threshold = 0.7 # the optimal threshold is dataset-dependent
>>> if similarity < threshold: >>> if similarity < threshold:
... print("Speakers are not the same!") ... print("Speakers are not the same!")
``` ```
...@@ -1256,12 +1256,14 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -1256,12 +1256,14 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"] >>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 >>> inputs["labels"] = tf.reshape(
... tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))
... ) # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1276,17 +1278,17 @@ TF_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -1276,17 +1278,17 @@ TF_QUESTION_ANSWERING_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf') >>> input_dict = tokenizer(question, text, return_tensors="tf")
>>> outputs = model(input_dict) >>> outputs = model(input_dict)
>>> start_logits = outputs.start_logits >>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits >>> end_logits = outputs.end_logits
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1]) >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
``` ```
""" """
...@@ -1297,11 +1299,11 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1297,11 +1299,11 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1316,8 +1318,8 @@ TF_MASKED_LM_SAMPLE = r""" ...@@ -1316,8 +1318,8 @@ TF_MASKED_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf") >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
>>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
...@@ -1335,8 +1337,8 @@ TF_BASE_MODEL_SAMPLE = r""" ...@@ -1335,8 +1337,8 @@ TF_BASE_MODEL_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs) >>> outputs = model(inputs)
...@@ -1352,16 +1354,16 @@ TF_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1352,16 +1354,16 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='tf', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}} >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1 >>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained >>> # the linear classifier still needs to be trained
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1375,8 +1377,8 @@ TF_CAUSAL_LM_SAMPLE = r""" ...@@ -1375,8 +1377,8 @@ TF_CAUSAL_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs) >>> outputs = model(inputs)
...@@ -1401,10 +1403,10 @@ FLAX_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -1401,10 +1403,10 @@ FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1417,11 +1419,11 @@ FLAX_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -1417,11 +1419,11 @@ FLAX_QUESTION_ANSWERING_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='jax') >>> inputs = tokenizer(question, text, return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> start_scores = outputs.start_logits >>> start_scores = outputs.start_logits
...@@ -1435,10 +1437,10 @@ FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1435,10 +1437,10 @@ FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1451,10 +1453,10 @@ FLAX_MASKED_LM_SAMPLE = r""" ...@@ -1451,10 +1453,10 @@ FLAX_MASKED_LM_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors='jax') >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1467,10 +1469,10 @@ FLAX_BASE_MODEL_SAMPLE = r""" ...@@ -1467,10 +1469,10 @@ FLAX_BASE_MODEL_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state >>> last_hidden_states = outputs.last_hidden_state
...@@ -1483,15 +1485,15 @@ FLAX_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1483,15 +1485,15 @@ FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='jax', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
>>> outputs = model(**{{k: v[None, :] for k,v in encoding.items()}}) >>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
>>> logits = outputs.logits >>> logits = outputs.logits
``` ```
...@@ -1503,8 +1505,8 @@ FLAX_CAUSAL_LM_SAMPLE = r""" ...@@ -1503,8 +1505,8 @@ FLAX_CAUSAL_LM_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
......