Unverified Commit a5737779 authored by Sylvain Gugger, committed by GitHub

Update repo to isort v5 (#6686)

* Run new isort

* More changes

* Update CI, CONTRIBUTING and benchmarks
parent d329c9b0
...
@@ -235,8 +235,7 @@ jobs:
             - v0.3-code_quality-{{ checksum "setup.py" }}
             - v0.3-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
-      - run: pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
+      - run: pip install isort
       - run: pip install .[tf,torch,quality]
       - save_cache:
           key: v0.3-code_quality-{{ checksum "setup.py" }}
...
...
@@ -134,12 +134,6 @@ Follow these steps to start contributing:
    it with `pip uninstall transformers` before reinstalling it in editable
    mode with the `-e` flag.)
-
-   Right now, we need an unreleased version of `isort` to avoid a
-   [bug](https://github.com/timothycrosley/isort/pull/1000):
-
-   ```bash
-   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
-   ```
 5. Develop the features on your branch.
    As you work on the features, you should make sure that the test suite
...
...
@@ -4,7 +4,7 @@
 quality:
 	black --check --line-length 119 --target-version py35 examples templates tests src utils
-	isort --check-only --recursive examples templates tests src utils
+	isort --check-only examples templates tests src utils
 	flake8 examples templates tests src utils
 	python utils/check_repo.py
@@ -12,7 +12,7 @@ quality:
 style:
 	black --line-length 119 --target-version py35 examples templates tests src utils
-	isort --recursive examples templates tests src utils
+	isort examples templates tests src utils

 # Run tests for the library
...
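The dropped flag is the visible part of the upgrade: isort 5 walks directories passed on the command line automatically, so `--recursive` is no longer needed (and is no longer accepted). The same sorting is also exposed as a Python API in isort 5. A minimal sketch, not part of the diff, with a sample string of our own:

```python
# Minimal sketch of the isort 5 Python API that backs the CLI calls above.
import isort

messy = "import os\nimport argparse\nfrom typing import List\n"

# isort.code returns the source with imports sorted (what `isort <paths>` does per file).
print(isort.code(messy))

# isort.check_code mirrors `isort --check-only`: True if already sorted.
print(isort.check_code(messy))
```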
...
@@ -20,8 +20,8 @@ from dataclasses import dataclass
 from typing import List, Optional, Union

 import tqdm
-
 from filelock import FileLock
+
 from transformers import (
     BartTokenizer,
     BartTokenizerFast,
...
...
@@ -26,8 +26,8 @@ from enum import Enum
 from typing import List, Optional

 import tqdm
-
 from filelock import FileLock
+
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
...
...
@@ -44,9 +44,10 @@ def evaluate(args):
     reference_summaries = []
     generated_summaries = []

-    import rouge
     import nltk
+    import rouge

     nltk.download("punkt")
     rouge_evaluator = rouge.Rouge(
         metrics=["rouge-n", "rouge-l"],
...
...
@@ -15,27 +15,27 @@ from transformers import BartConfig, BartForConditionalGeneration, MBartTokenize
 try:
     from .finetune import SummarizationModule, TranslationModule
-    from .initialization_utils import init_student, copy_layers
+    from .finetune import main as ft_main
+    from .initialization_utils import copy_layers, init_student
     from .utils import (
-        use_task_specific_params,
-        pickle_load,
-        freeze_params,
-        assert_all_frozen,
         any_requires_grad,
+        assert_all_frozen,
         calculate_bleu_score,
+        freeze_params,
+        pickle_load,
+        use_task_specific_params,
     )
-    from .finetune import main as ft_main
 except ImportError:
     from finetune import SummarizationModule, TranslationModule
     from finetune import main as ft_main
-    from initialization_utils import init_student, copy_layers
+    from initialization_utils import copy_layers, init_student
     from utils import (
-        use_task_specific_params,
-        pickle_load,
-        freeze_params,
-        assert_all_frozen,
         any_requires_grad,
+        assert_all_frozen,
         calculate_bleu_score,
+        freeze_params,
+        pickle_load,
+        use_task_specific_params,
     )
...
...
@@ -17,44 +17,43 @@ from transformers import MarianTokenizer, MBartTokenizer, T5ForConditionalGenera
 try:
+    from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
     from .utils import (
+        ROUGE_KEYS,
+        Seq2SeqDataset,
+        TranslationDataset,
         assert_all_frozen,
-        use_task_specific_params,
-        lmap,
+        calculate_bleu_score,
+        calculate_rouge,
         flatten_list,
-        pickle_save,
-        save_git_info,
-        save_json,
         freeze_params,
-        calculate_rouge,
         get_git_info,
-        ROUGE_KEYS,
-        calculate_bleu_score,
-        Seq2SeqDataset,
-        TranslationDataset,
         label_smoothed_nll_loss,
+        lmap,
+        pickle_save,
+        save_git_info,
+        save_json,
+        use_task_specific_params,
     )
-    from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
 except ImportError:
+    from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
     from utils import (
+        ROUGE_KEYS,
         Seq2SeqDataset,
         TranslationDataset,
         assert_all_frozen,
-        use_task_specific_params,
-        lmap,
+        calculate_bleu_score,
+        calculate_rouge,
         flatten_list,
-        pickle_save,
-        save_git_info,
-        save_json,
         freeze_params,
-        calculate_rouge,
         get_git_info,
-        ROUGE_KEYS,
-        calculate_bleu_score,
         label_smoothed_nll_loss,
+        lmap,
+        pickle_save,
+        save_git_info,
+        save_json,
+        use_task_specific_params,
     )
-    from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback

 logger = logging.getLogger(__name__)
...
...
@@ -9,9 +9,9 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 try:
-    from .utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch
+    from .utils import calculate_bleu_score, calculate_rouge, trim_batch, use_task_specific_params
 except ImportError:
-    from utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch
+    from utils import calculate_bleu_score, calculate_rouge, trim_batch, use_task_specific_params

 DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
...
...
@@ -35,8 +35,8 @@ sys.path.extend(SRC_DIRS)
 if SRC_DIRS is not None:
     import run_generation
     import run_glue
-    import run_pl_glue
     import run_language_modeling
+    import run_pl_glue
     import run_squad
...
...
@@ -23,7 +23,6 @@ from enum import Enum
 from typing import List, Optional, Union

 from filelock import FileLock
-
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
...
 [isort]
+default_section = FIRSTPARTY
 ensure_newline_before_comments = True
 force_grid_wrap = 0
 include_trailing_comma = True
...
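The new `default_section` line is worth a note: isort 5 changed its built-in default for that option from `FIRSTPARTY` to `THIRDPARTY`, which is presumably why the repo now pins it explicitly so that unrecognized local modules keep sorting as first-party. A hedged sketch of how such settings surface through the isort 5 Python API; the sample module names are ours:

```python
# Sketch only: mirror a couple of the setup.cfg values above in an isort 5
# Config object and ask isort how it classifies modules under it.
import isort

config = isort.Config(default_section="FIRSTPARTY", line_length=119)

# place_module reports the section a module would be sorted into.
print(isort.place_module("os", config=config))  # "STDLIB"

# code() accepts the same Config, so tooling can reuse the repo settings.
print(isort.code("import numpy\nimport os\n", config=config))
```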
...
@@ -91,12 +91,7 @@ extras["all"] = extras["serving"] + ["tensorflow", "torch"]
 extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
 # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
 extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
-extras["quality"] = [
-    "black",
-    # "isort",
-    "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
-    "flake8",
-]
+extras["quality"] = ["black", "isort >= 5", "flake8"]
 extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + ["scikit-learn", "tensorflow", "torch"]

 setup(
...
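With the git pin gone, the quality extras install straight from PyPI again. Since a contributor with a stale isort 4 environment would now get confusing diffs, a small guard can help; a hedged sketch, with a helper name of our own, not something from the repo:

```python
# Sketch: fail fast if the environment does not satisfy the new "isort >= 5" pin.
import isort


def ensure_isort5() -> None:
    major = int(isort.__version__.split(".")[0])
    if major < 5:
        raise RuntimeError(f"isort >= 5 required, found {isort.__version__}")


ensure_isort5()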
...
@@ -189,241 +189,246 @@ if is_sklearn_available():
 # Modeling
 if is_torch_available():
+    # Benchmarks
+    from .benchmark.benchmark import PyTorchBenchmark
+    from .benchmark.benchmark_args import PyTorchBenchmarkArguments
+    from .data.data_collator import (
+        DataCollator,
+        DataCollatorForLanguageModeling,
+        DataCollatorForPermutationLanguageModeling,
+        DataCollatorWithPadding,
+        default_data_collator,
+    )
+    from .data.datasets import (
+        GlueDataset,
+        GlueDataTrainingArguments,
+        LineByLineTextDataset,
+        SquadDataset,
+        SquadDataTrainingArguments,
+        TextDataset,
+    )
     from .generation_utils import top_k_top_p_filtering
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, apply_chunking_to_forward
-    from .modeling_auto import (
-        AutoModel,
-        AutoModelForPreTraining,
-        AutoModelForSequenceClassification,
-        AutoModelForQuestionAnswering,
-        AutoModelWithLMHead,
-        AutoModelForCausalLM,
-        AutoModelForMaskedLM,
-        AutoModelForSeq2SeqLM,
-        AutoModelForTokenClassification,
-        AutoModelForMultipleChoice,
-        MODEL_MAPPING,
-        MODEL_FOR_PRETRAINING_MAPPING,
-        MODEL_WITH_LM_HEAD_MAPPING,
-        MODEL_FOR_CAUSAL_LM_MAPPING,
-        MODEL_FOR_MASKED_LM_MAPPING,
-        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-    )
-    from .modeling_mobilebert import (
-        MobileBertPreTrainedModel,
-        MobileBertModel,
-        MobileBertForPreTraining,
-        MobileBertForSequenceClassification,
-        MobileBertForQuestionAnswering,
-        MobileBertForMaskedLM,
-        MobileBertForNextSentencePrediction,
-        MobileBertForMultipleChoice,
-        MobileBertForTokenClassification,
-        load_tf_weights_in_mobilebert,
-        MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        MobileBertLayer,
-    )
-    from .modeling_bert import (
-        BertPreTrainedModel,
-        BertModel,
-        BertForPreTraining,
-        BertForMaskedLM,
-        BertLMHeadModel,
-        BertForNextSentencePrediction,
-        BertForSequenceClassification,
-        BertForMultipleChoice,
-        BertForTokenClassification,
-        BertForQuestionAnswering,
-        load_tf_weights_in_bert,
-        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        BertLayer,
-    )
-    from .modeling_openai import (
-        OpenAIGPTPreTrainedModel,
-        OpenAIGPTModel,
-        OpenAIGPTLMHeadModel,
-        OpenAIGPTDoubleHeadsModel,
-        load_tf_weights_in_openai_gpt,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_transfo_xl import (
-        TransfoXLPreTrainedModel,
-        TransfoXLModel,
-        TransfoXLLMHeadModel,
-        AdaptiveEmbedding,
-        load_tf_weights_in_transfo_xl,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_gpt2 import (
-        GPT2PreTrainedModel,
-        GPT2Model,
-        GPT2LMHeadModel,
-        GPT2DoubleHeadsModel,
-        load_tf_weights_in_gpt2,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_LIST
-    from .modeling_xlnet import (
-        XLNetPreTrainedModel,
-        XLNetModel,
-        XLNetLMHeadModel,
-        XLNetForSequenceClassification,
-        XLNetForTokenClassification,
-        XLNetForMultipleChoice,
-        XLNetForQuestionAnsweringSimple,
-        XLNetForQuestionAnswering,
-        load_tf_weights_in_xlnet,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_xlm import (
-        XLMPreTrainedModel,
-        XLMModel,
-        XLMWithLMHeadModel,
-        XLMForSequenceClassification,
-        XLMForTokenClassification,
-        XLMForQuestionAnswering,
-        XLMForQuestionAnsweringSimple,
-        XLMForMultipleChoice,
-        XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
+    from .modeling_albert import (
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        AlbertForMaskedLM,
+        AlbertForMultipleChoice,
+        AlbertForPreTraining,
+        AlbertForQuestionAnswering,
+        AlbertForSequenceClassification,
+        AlbertForTokenClassification,
+        AlbertModel,
+        AlbertPreTrainedModel,
+        load_tf_weights_in_albert,
+    )
+    from .modeling_auto import (
+        MODEL_FOR_CAUSAL_LM_MAPPING,
+        MODEL_FOR_MASKED_LM_MAPPING,
+        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+        MODEL_FOR_PRETRAINING_MAPPING,
+        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_MAPPING,
+        MODEL_WITH_LM_HEAD_MAPPING,
+        AutoModel,
+        AutoModelForCausalLM,
+        AutoModelForMaskedLM,
+        AutoModelForMultipleChoice,
+        AutoModelForPreTraining,
+        AutoModelForQuestionAnswering,
+        AutoModelForSeq2SeqLM,
+        AutoModelForSequenceClassification,
+        AutoModelForTokenClassification,
+        AutoModelWithLMHead,
+    )
+    from .modeling_bart import (
+        BART_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BartForConditionalGeneration,
+        BartForQuestionAnswering,
+        BartForSequenceClassification,
+        BartModel,
+        PretrainedBartModel,
+    )
+    from .modeling_bert import (
+        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BertForMaskedLM,
+        BertForMultipleChoice,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertLayer,
+        BertLMHeadModel,
+        BertModel,
+        BertPreTrainedModel,
+        load_tf_weights_in_bert,
+    )
+    from .modeling_camembert import (
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        CamembertForCausalLM,
+        CamembertForMaskedLM,
+        CamembertForMultipleChoice,
+        CamembertForQuestionAnswering,
+        CamembertForSequenceClassification,
+        CamembertForTokenClassification,
+        CamembertModel,
+    )
+    from .modeling_ctrl import CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, CTRLLMHeadModel, CTRLModel, CTRLPreTrainedModel
+    from .modeling_distilbert import (
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        DistilBertForMaskedLM,
+        DistilBertForMultipleChoice,
+        DistilBertForQuestionAnswering,
+        DistilBertForSequenceClassification,
+        DistilBertForTokenClassification,
+        DistilBertModel,
+        DistilBertPreTrainedModel,
+    )
+    from .modeling_dpr import (
+        DPRContextEncoder,
+        DPRPretrainedContextEncoder,
+        DPRPretrainedQuestionEncoder,
+        DPRPretrainedReader,
+        DPRQuestionEncoder,
+        DPRReader,
+    )
+    from .modeling_electra import (
+        ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
+        ElectraForMaskedLM,
+        ElectraForMultipleChoice,
+        ElectraForPreTraining,
+        ElectraForQuestionAnswering,
+        ElectraForSequenceClassification,
+        ElectraForTokenClassification,
+        ElectraModel,
+        ElectraPreTrainedModel,
+        load_tf_weights_in_electra,
+    )
+    from .modeling_encoder_decoder import EncoderDecoderModel
+    from .modeling_flaubert import (
+        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        FlaubertForMultipleChoice,
+        FlaubertForQuestionAnswering,
+        FlaubertForQuestionAnsweringSimple,
+        FlaubertForSequenceClassification,
+        FlaubertForTokenClassification,
+        FlaubertModel,
+        FlaubertWithLMHeadModel,
+    )
+    from .modeling_gpt2 import (
+        GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
+        GPT2DoubleHeadsModel,
+        GPT2LMHeadModel,
+        GPT2Model,
+        GPT2PreTrainedModel,
+        load_tf_weights_in_gpt2,
+    )
+    from .modeling_longformer import (
+        LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+        LongformerForMaskedLM,
+        LongformerForMultipleChoice,
+        LongformerForQuestionAnswering,
+        LongformerForSequenceClassification,
+        LongformerForTokenClassification,
+        LongformerModel,
+        LongformerSelfAttention,
+    )
+    from .modeling_marian import MarianMTModel
+    from .modeling_mbart import MBartForConditionalGeneration
+    from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
+    from .modeling_mobilebert import (
+        MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        MobileBertForMaskedLM,
+        MobileBertForMultipleChoice,
+        MobileBertForNextSentencePrediction,
+        MobileBertForPreTraining,
+        MobileBertForQuestionAnswering,
+        MobileBertForSequenceClassification,
+        MobileBertForTokenClassification,
+        MobileBertLayer,
+        MobileBertModel,
+        MobileBertPreTrainedModel,
+        load_tf_weights_in_mobilebert,
+    )
+    from .modeling_openai import (
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        OpenAIGPTDoubleHeadsModel,
+        OpenAIGPTLMHeadModel,
+        OpenAIGPTModel,
+        OpenAIGPTPreTrainedModel,
+        load_tf_weights_in_openai_gpt,
+    )
     from .modeling_pegasus import PegasusForConditionalGeneration
-    from .modeling_bart import (
-        PretrainedBartModel,
-        BartForSequenceClassification,
-        BartModel,
-        BartForConditionalGeneration,
-        BartForQuestionAnswering,
-        BART_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_mbart import MBartForConditionalGeneration
-    from .modeling_marian import MarianMTModel
-    from .tokenization_marian import MarianTokenizer
-    from .modeling_roberta import (
-        RobertaForMaskedLM,
-        RobertaForCausalLM,
-        RobertaModel,
-        RobertaForSequenceClassification,
-        RobertaForMultipleChoice,
-        RobertaForTokenClassification,
-        RobertaForQuestionAnswering,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_distilbert import (
-        DistilBertPreTrainedModel,
-        DistilBertForMaskedLM,
-        DistilBertModel,
-        DistilBertForMultipleChoice,
-        DistilBertForSequenceClassification,
-        DistilBertForQuestionAnswering,
-        DistilBertForTokenClassification,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_camembert import (
-        CamembertForMaskedLM,
-        CamembertModel,
-        CamembertForSequenceClassification,
-        CamembertForMultipleChoice,
-        CamembertForTokenClassification,
-        CamembertForQuestionAnswering,
-        CamembertForCausalLM,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_encoder_decoder import EncoderDecoderModel
-    from .modeling_t5 import (
-        T5PreTrainedModel,
-        T5Model,
-        T5ForConditionalGeneration,
-        load_tf_weights_in_t5,
-        T5_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_albert import (
-        AlbertPreTrainedModel,
-        AlbertModel,
-        AlbertForPreTraining,
-        AlbertForMaskedLM,
-        AlbertForMultipleChoice,
-        AlbertForSequenceClassification,
-        AlbertForQuestionAnswering,
-        AlbertForTokenClassification,
-        load_tf_weights_in_albert,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_xlm_roberta import (
-        XLMRobertaForMaskedLM,
-        XLMRobertaModel,
-        XLMRobertaForMultipleChoice,
-        XLMRobertaForSequenceClassification,
-        XLMRobertaForTokenClassification,
-        XLMRobertaForQuestionAnswering,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
-    from .modeling_flaubert import (
-        FlaubertModel,
-        FlaubertWithLMHeadModel,
-        FlaubertForSequenceClassification,
-        FlaubertForTokenClassification,
-        FlaubertForQuestionAnswering,
-        FlaubertForQuestionAnsweringSimple,
-        FlaubertForTokenClassification,
-        FlaubertForMultipleChoice,
-        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_electra import (
-        ElectraForPreTraining,
-        ElectraForMaskedLM,
-        ElectraForTokenClassification,
-        ElectraPreTrainedModel,
-        ElectraForMultipleChoice,
-        ElectraForSequenceClassification,
-        ElectraForQuestionAnswering,
-        ElectraModel,
-        load_tf_weights_in_electra,
-        ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_reformer import (
-        ReformerAttention,
-        ReformerLayer,
-        ReformerModel,
-        ReformerForMaskedLM,
-        ReformerModelWithLMHead,
-        ReformerForSequenceClassification,
-        ReformerForQuestionAnswering,
-        REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_longformer import (
-        LongformerModel,
-        LongformerForMaskedLM,
-        LongformerForSequenceClassification,
-        LongformerForMultipleChoice,
-        LongformerForTokenClassification,
-        LongformerForQuestionAnswering,
-        LongformerSelfAttention,
-        LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-    from .modeling_dpr import (
-        DPRPretrainedContextEncoder,
-        DPRPretrainedQuestionEncoder,
-        DPRPretrainedReader,
-        DPRContextEncoder,
-        DPRQuestionEncoder,
-        DPRReader,
-    )
-    from .modeling_retribert import (
-        RetriBertPreTrainedModel,
-        RetriBertModel,
-        RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
+    from .modeling_reformer import (
+        REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+        ReformerAttention,
+        ReformerForMaskedLM,
+        ReformerForQuestionAnswering,
+        ReformerForSequenceClassification,
+        ReformerLayer,
+        ReformerModel,
+        ReformerModelWithLMHead,
+    )
+    from .modeling_retribert import RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST, RetriBertModel, RetriBertPreTrainedModel
+    from .modeling_roberta import (
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
+        RobertaForCausalLM,
+        RobertaForMaskedLM,
+        RobertaForMultipleChoice,
+        RobertaForQuestionAnswering,
+        RobertaForSequenceClassification,
+        RobertaForTokenClassification,
+        RobertaModel,
+    )
+    from .modeling_t5 import (
+        T5_PRETRAINED_MODEL_ARCHIVE_LIST,
+        T5ForConditionalGeneration,
+        T5Model,
+        T5PreTrainedModel,
+        load_tf_weights_in_t5,
+    )
+    from .modeling_transfo_xl import (
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
+        AdaptiveEmbedding,
+        TransfoXLLMHeadModel,
+        TransfoXLModel,
+        TransfoXLPreTrainedModel,
+        load_tf_weights_in_transfo_xl,
+    )
+    from .modeling_utils import Conv1D, PreTrainedModel, apply_chunking_to_forward, prune_layer
+    from .modeling_xlm import (
+        XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
+        XLMForMultipleChoice,
+        XLMForQuestionAnswering,
+        XLMForQuestionAnsweringSimple,
+        XLMForSequenceClassification,
+        XLMForTokenClassification,
+        XLMModel,
+        XLMPreTrainedModel,
+        XLMWithLMHeadModel,
+    )
+    from .modeling_xlm_roberta import (
+        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
+        XLMRobertaForMaskedLM,
+        XLMRobertaForMultipleChoice,
+        XLMRobertaForQuestionAnswering,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+        XLMRobertaModel,
+    )
+    from .modeling_xlnet import (
+        XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+        XLNetForMultipleChoice,
+        XLNetForQuestionAnswering,
+        XLNetForQuestionAnsweringSimple,
+        XLNetForSequenceClassification,
+        XLNetForTokenClassification,
+        XLNetLMHeadModel,
+        XLNetModel,
+        XLNetPreTrainedModel,
+        load_tf_weights_in_xlnet,
+    )

     # Optimization
...
@@ -436,78 +441,55 @@ if is_torch_available():
         get_linear_schedule_with_warmup,
         get_polynomial_decay_schedule_with_warmup,
     )
+    from .tokenization_marian import MarianTokenizer

     # Trainer
-    from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
-    from .data.data_collator import (
-        default_data_collator,
-        DataCollator,
-        DataCollatorForLanguageModeling,
-        DataCollatorForPermutationLanguageModeling,
-        DataCollatorWithPadding,
-    )
-    from .data.datasets import (
-        GlueDataset,
-        TextDataset,
-        LineByLineTextDataset,
-        GlueDataTrainingArguments,
-        SquadDataset,
-        SquadDataTrainingArguments,
-    )
-
-    # Benchmarks
-    from .benchmark.benchmark import PyTorchBenchmark
-    from .benchmark.benchmark_args import PyTorchBenchmarkArguments
+    from .trainer import EvalPrediction, Trainer, set_seed, torch_distributed_zero_first

 # TensorFlow
 if is_tf_available():
+    from .benchmark.benchmark_args_tf import TensorFlowBenchmarkArguments
+
+    # Benchmarks
+    from .benchmark.benchmark_tf import TensorFlowBenchmark
     from .generation_tf_utils import tf_top_k_top_p_filtering
-    from .modeling_tf_utils import (
-        shape_list,
-        TFPreTrainedModel,
-        TFSequenceSummary,
-        TFSharedEmbeddings,
-    )
+    from .modeling_tf_albert import (
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFAlbertForMaskedLM,
+        TFAlbertForMultipleChoice,
+        TFAlbertForPreTraining,
+        TFAlbertForQuestionAnswering,
+        TFAlbertForSequenceClassification,
+        TFAlbertForTokenClassification,
+        TFAlbertMainLayer,
+        TFAlbertModel,
+        TFAlbertPreTrainedModel,
+    )
     from .modeling_tf_auto import (
-        TF_MODEL_MAPPING,
+        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_MASKED_LM_MAPPING,
         TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
         TF_MODEL_FOR_PRETRAINING_MAPPING,
         TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        TF_MODEL_MAPPING,
         TF_MODEL_WITH_LM_HEAD_MAPPING,
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         TFAutoModel,
+        TFAutoModelForCausalLM,
+        TFAutoModelForMaskedLM,
         TFAutoModelForMultipleChoice,
         TFAutoModelForPreTraining,
         TFAutoModelForQuestionAnswering,
+        TFAutoModelForSeq2SeqLM,
         TFAutoModelForSequenceClassification,
         TFAutoModelForTokenClassification,
         TFAutoModelWithLMHead,
-        TFAutoModelForCausalLM,
-        TFAutoModelForMaskedLM,
-        TFAutoModelForSeq2SeqLM,
     )
-    from .modeling_tf_albert import (
-        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        TFAlbertForMaskedLM,
-        TFAlbertForMultipleChoice,
-        TFAlbertForPreTraining,
-        TFAlbertForQuestionAnswering,
-        TFAlbertForSequenceClassification,
-        TFAlbertForTokenClassification,
-        TFAlbertMainLayer,
-        TFAlbertModel,
-        TFAlbertPreTrainedModel,
-    )
     from .modeling_tf_bert import (
         TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFBertEmbeddings,
-        TFBertLMHeadModel,
         TFBertForMaskedLM,
         TFBertForMultipleChoice,
         TFBertForNextSentencePrediction,
...
@@ -515,28 +497,26 @@ if is_tf_available():
         TFBertForQuestionAnswering,
         TFBertForSequenceClassification,
         TFBertForTokenClassification,
+        TFBertLMHeadModel,
         TFBertMainLayer,
         TFBertModel,
         TFBertPreTrainedModel,
     )
-
     from .modeling_tf_camembert import (
         TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFCamembertForMaskedLM,
-        TFCamembertModel,
         TFCamembertForMultipleChoice,
         TFCamembertForQuestionAnswering,
         TFCamembertForSequenceClassification,
         TFCamembertForTokenClassification,
+        TFCamembertModel,
     )
-
     from .modeling_tf_ctrl import (
         TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFCTRLLMHeadModel,
         TFCTRLModel,
         TFCTRLPreTrainedModel,
     )
-
     from .modeling_tf_distilbert import (
         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFDistilBertForMaskedLM,
...
@@ -548,7 +528,6 @@ if is_tf_available():
         TFDistilBertModel,
         TFDistilBertPreTrainedModel,
     )
-
     from .modeling_tf_electra import (
         TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFElectraForMaskedLM,
...
@@ -560,17 +539,15 @@ if is_tf_available():
         TFElectraModel,
         TFElectraPreTrainedModel,
     )
-
     from .modeling_tf_flaubert import (
         TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFFlaubertForMultipleChoice,
         TFFlaubertForQuestionAnsweringSimple,
         TFFlaubertForSequenceClassification,
         TFFlaubertForTokenClassification,
-        TFFlaubertWithLMHeadModel,
         TFFlaubertModel,
+        TFFlaubertWithLMHeadModel,
     )
-
     from .modeling_tf_gpt2 import (
         TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFGPT2DoubleHeadsModel,
...
@@ -579,29 +556,26 @@ if is_tf_available():
         TFGPT2Model,
         TFGPT2PreTrainedModel,
     )
-
     from .modeling_tf_longformer import (
         TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-        TFLongformerModel,
         TFLongformerForMaskedLM,
         TFLongformerForQuestionAnswering,
+        TFLongformerModel,
         TFLongformerSelfAttention,
     )
-
     from .modeling_tf_mobilebert import (
         TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        TFMobileBertModel,
-        TFMobileBertPreTrainedModel,
-        TFMobileBertForPreTraining,
-        TFMobileBertForSequenceClassification,
-        TFMobileBertForQuestionAnswering,
         TFMobileBertForMaskedLM,
-        TFMobileBertForNextSentencePrediction,
         TFMobileBertForMultipleChoice,
+        TFMobileBertForNextSentencePrediction,
+        TFMobileBertForPreTraining,
+        TFMobileBertForQuestionAnswering,
+        TFMobileBertForSequenceClassification,
         TFMobileBertForTokenClassification,
         TFMobileBertMainLayer,
+        TFMobileBertModel,
+        TFMobileBertPreTrainedModel,
     )
-
     from .modeling_tf_openai import (
         TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFOpenAIGPTDoubleHeadsModel,
...
@@ -610,7 +584,6 @@ if is_tf_available():
         TFOpenAIGPTModel,
         TFOpenAIGPTPreTrainedModel,
     )
-
     from .modeling_tf_roberta import (
         TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFRobertaForMaskedLM,
...
@@ -622,14 +595,12 @@ if is_tf_available():
         TFRobertaModel,
         TFRobertaPreTrainedModel,
     )
-
     from .modeling_tf_t5 import (
         TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFT5ForConditionalGeneration,
         TFT5Model,
         TFT5PreTrainedModel,
     )
-
     from .modeling_tf_transfo_xl import (
         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFAdaptiveEmbedding,
...
@@ -638,19 +609,18 @@ if is_tf_available():
         TFTransfoXLModel,
         TFTransfoXLPreTrainedModel,
     )
-
+    from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, shape_list
     from .modeling_tf_xlm import (
         TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFXLMForMultipleChoice,
         TFXLMForQuestionAnsweringSimple,
         TFXLMForSequenceClassification,
         TFXLMForTokenClassification,
-        TFXLMWithLMHeadModel,
         TFXLMMainLayer,
         TFXLMModel,
         TFXLMPreTrainedModel,
+        TFXLMWithLMHeadModel,
     )
-
     from .modeling_tf_xlm_roberta import (
         TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFXLMRobertaForMaskedLM,
...
@@ -660,7 +630,6 @@ if is_tf_available():
         TFXLMRobertaForTokenClassification,
         TFXLMRobertaModel,
     )
-
     from .modeling_tf_xlnet import (
         TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFXLNetForMultipleChoice,
...
@@ -674,20 +643,11 @@ if is_tf_available():
     )

     # Optimization
-    from .optimization_tf import (
-        AdamWeightDecay,
-        create_optimizer,
-        GradientAccumulator,
-        WarmUp,
-    )
+    from .optimization_tf import AdamWeightDecay, GradientAccumulator, WarmUp, create_optimizer

     # Trainer
     from .trainer_tf import TFTrainer

-    # Benchmarks
-    from .benchmark.benchmark_tf import TensorFlowBenchmark
-    from .benchmark.benchmark_args_tf import TensorFlowBenchmarkArguments
-
 if not is_tf_available() and not is_torch_available():
     logger.warning(
...
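The reshuffling throughout this file follows isort 5's default ordering inside a parenthesized import, as the repo configures it: with `order_by_type` on, `UPPER_CASE` constants sort ahead of CamelCase classes, which sort ahead of snake_case functions, and names within each group compare case-insensitively. That is why every `*_PRETRAINED_MODEL_ARCHIVE_LIST` constant now leads its block and every `load_tf_weights_in_*` helper trails it. A small sketch; the sample snippet is ours:

```python
# Sketch: reproduce the constants-classes-functions ordering seen above.
import isort

snippet = (
    "from .modeling_bert import (\n"
    "    load_tf_weights_in_bert,\n"
    "    BertModel,\n"
    "    BERT_PRETRAINED_MODEL_ARCHIVE_LIST,\n"
    ")\n"
)

# order_by_type=True is the isort default; passed explicitly for emphasis.
# The constant comes out first, then the class, then the function.
print(isort.code(snippet, order_by_type=True, use_parentheses=True))
```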
...
@@ -22,14 +22,9 @@ import logging
 import timeit
 from typing import Callable, Optional

-from transformers import (
-    MODEL_MAPPING,
-    MODEL_WITH_LM_HEAD_MAPPING,
-    PretrainedConfig,
-    is_py3nvml_available,
-    is_torch_available,
-)
+from ..configuration_utils import PretrainedConfig
+from ..file_utils import is_py3nvml_available, is_torch_available
+from ..modeling_auto import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING
 from .benchmark_utils import (
     Benchmark,
     Memory,
@@ -42,6 +37,7 @@ from .benchmark_utils import (
 if is_torch_available():
     import torch
+
     from .benchmark_args import PyTorchBenchmarkArguments
...
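Beyond re-sorting, this hunk (and its TensorFlow twin below) swaps absolute `from transformers import ...` statements for relative imports of the defining modules. Inside a package whose `__init__.py` re-exports everything, importing back through the package root from a submodule can trip a circular import while initialization is still in flight; importing the sibling module directly side-steps that. A hedged, self-contained sketch; the package name and layout are ours, not the repo's:

```python
# Build a throwaway package on disk to show the pattern: benchmark.py pulls
# MODEL_MAPPING from the module that defines it, not from the package root.
import pathlib
import sys
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
pkg = root / "pkg"
pkg.mkdir()
(pkg / "modeling.py").write_text("MODEL_MAPPING = {}\n")
# "from pkg import MODEL_MAPPING" here would fail: pkg/__init__.py below has
# not finished executing (and has not defined MODEL_MAPPING) when benchmark.py
# is imported. The relative import of the defining module is safe.
(pkg / "benchmark.py").write_text("from .modeling import MODEL_MAPPING\n")
(pkg / "__init__.py").write_text("from . import benchmark\n")

sys.path.insert(0, str(root))
import pkg  # initializes cleanly thanks to the relative import

print(pkg.benchmark.MODEL_MAPPING)  # {}
```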
...
@@ -24,14 +24,9 @@ import timeit
 from functools import wraps
 from typing import Callable, Optional

-from transformers import (
-    TF_MODEL_MAPPING,
-    TF_MODEL_WITH_LM_HEAD_MAPPING,
-    PretrainedConfig,
-    is_py3nvml_available,
-    is_tf_available,
-)
+from ..configuration_utils import PretrainedConfig
+from ..file_utils import is_py3nvml_available, is_tf_available
+from ..modeling_tf_auto import TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING
 from .benchmark_utils import (
     Benchmark,
     Memory,
@@ -44,9 +39,10 @@ from .benchmark_utils import (
 if is_tf_available():
     import tensorflow as tf
-    from .benchmark_args_tf import TensorFlowBenchmarkArguments
     from tensorflow.python.framework.errors_impl import ResourceExhaustedError

+    from .benchmark_args_tf import TensorFlowBenchmarkArguments
+
 if is_py3nvml_available():
     import py3nvml.py3nvml as nvml
...
...
@@ -8,11 +8,11 @@ from transformers.pipelines import SUPPORTED_TASKS, pipeline
 try:
-    from uvicorn import run
-    from fastapi import FastAPI, HTTPException, Body
+    from fastapi import Body, FastAPI, HTTPException
     from fastapi.routing import APIRoute
     from pydantic import BaseModel
     from starlette.responses import JSONResponse
+    from uvicorn import run

     _serve_dependencies_installed = True
 except (ImportError, AttributeError):
...
...
@@ -5,7 +5,6 @@ from getpass import getpass
 from typing import List, Union

 from requests.exceptions import HTTPError
-
 from transformers.commands import BaseTransformersCLICommand
 from transformers.hf_api import HfApi, HfFolder
...
...
@@ -273,7 +273,9 @@ def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
     try:
         import tensorflow as tf
-        from keras2onnx import convert_keras, save_model, __version__ as k2ov
+
+        from keras2onnx import __version__ as k2ov
+        from keras2onnx import convert_keras, save_model

         print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}")
@@ -340,7 +342,7 @@ def optimize(onnx_model_path: Path) -> Path:
     Returns: Path where the optimized model binary description has been saved
     """
-    from onnxruntime import SessionOptions, InferenceSession
+    from onnxruntime import InferenceSession, SessionOptions

     # Generate model name with suffix "optimized"
     opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
@@ -364,7 +366,7 @@ def quantize(onnx_model_path: Path) -> Path:
     """
     try:
         import onnx
-        from onnxruntime.quantization import quantize, QuantizationMode
+        from onnxruntime.quantization import QuantizationMode, quantize

         onnx_model = onnx.load(onnx_model_path.as_posix())
...
...
@@ -78,28 +78,29 @@ from transformers.file_utils import hf_bucket_url

 if is_torch_available():
-    import torch
     import numpy as np
+    import torch

     from transformers import (
+        AlbertForPreTraining,
         BertForPreTraining,
         BertForQuestionAnswering,
         BertForSequenceClassification,
+        CamembertForMaskedLM,
+        CTRLLMHeadModel,
+        DistilBertForMaskedLM,
+        DistilBertForQuestionAnswering,
+        ElectraForPreTraining,
+        FlaubertWithLMHeadModel,
         GPT2LMHeadModel,
-        XLNetLMHeadModel,
-        XLMWithLMHeadModel,
-        XLMRobertaForMaskedLM,
-        TransfoXLLMHeadModel,
         OpenAIGPTLMHeadModel,
         RobertaForMaskedLM,
         RobertaForSequenceClassification,
-        CamembertForMaskedLM,
-        FlaubertWithLMHeadModel,
-        DistilBertForMaskedLM,
-        DistilBertForQuestionAnswering,
-        CTRLLMHeadModel,
-        AlbertForPreTraining,
         T5ForConditionalGeneration,
-        ElectraForPreTraining,
+        TransfoXLLMHeadModel,
+        XLMRobertaForMaskedLM,
+        XLMWithLMHeadModel,
+        XLNetLMHeadModel,
     )
...