Unverified Commit dd9d483d authored by Julien Chaumond, committed by GitHub

Trainer (#3800)

* doc

* [tests] Add sample files for a regression task

* [HUGE] Trainer

* Feedback from @sshleifer

* Feedback from @thomwolf + logging tweak

* [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes

* [glue] Use default max_seq_length of 128 like before

* [glue] move DataTrainingArguments around

* [ner] Change interface of InputExample, and align run_{tf,pl}

* Re-align the pl scripts a little bit

* ner

* [ner] Add integration test

* Fix language_modeling with API tweak

* [ci] Tweak loss target

* Don't break console output

* amp.initialize: model must be on right device before

* [multiple-choice] update for Trainer

* Re-align to 827d6d6e
parent eb5601b0
Gleich O
darauf O
entwirft O
er O
seine O
Selbstdarstellung O
" O
Ecce B-OTH
homo I-OTH
" O
in O
enger O
Auseinandersetzung O
mit O
diesem O
Bild O
Jesu B-PER
. O
1980 O
kam O
der O
Crown B-OTH
als O
Versuch O
von O
Toyota B-ORG
, O
sich O
in O
der O
Oberen O
Mittelklasse O
zu O
etablieren O
, O
auch O
nach O
Deutschland B-LOC
. O
– O
4:26 O
# O
Sometime B-OTH
Ago/La I-OTH
Fiesta I-OTH
– O
23:18 O
Alle O
Stücke O
wurden O
von O
Corea B-PER
komponiert O
mit O
Ausnahme O
der O
einleitenden O
Improvisation O
zu O
Sometime B-OTH
Ago I-OTH
. O
Bis O
2013 O
steigen O
die O
Mittel O
aus O
dem O
EU-Budget B-ORGpart
auf O
rund O
120 O
Millionen O
Euro B-OTH
. O
Daraus O
entwickelte O
sich O
im O
Rokoko B-OTH
die O
Sitte O
des O
gemeinsamen O
Weinens O
im O
Theater O
, O
das O
die O
Standesgrenzen O
innerhalb O
des O
Publikums O
überbrücken O
sollte O
. O
Die O
Spinne O
hatte O
sie O
mit O
Seidenfäden O
an O
ihrem O
Schwanz O
gefesselt O
und O
nach O
oben O
gezogen O
. O
In O
Deutschland B-LOC
ist O
nach O
StGB O
eine O
Anwerbung O
für O
die O
Fremdenlegion O
strafbar O
. O
Am O
Donnerstag O
wird O
sich O
zeigen O
, O
ob O
die O
Idee O
der O
DLR-Forscher B-ORGpart
funktioniert O
. O
Der O
sechste O
Lauf O
der O
ADAC B-ORG
GT I-ORG
Masters I-ORG
stand O
ganz O
klar O
im O
Mittelpunkt O
des O
Motorsport-Wochenendes O
auf O
dem O
Eurospeedway B-ORG
Lausitz I-ORG
. O
Nach O
den O
schwächeren O
Vorgaben O
der O
Wall B-ORG
Street I-ORG
vom O
Vortag O
setzten O
die O
deutschen B-LOCderiv
Standardwerte O
ihren O
Konsolidierungskurs O
fort O
. O
Kolb B-PER
war O
seit O
1986 O
im O
Turnverein O
als O
Leiter O
tätig O
, O
darunter O
elf O
Jahre O
als O
Hauptleiter O
in O
der O
Männerriege O
. O
B-LOC
B-LOCderiv
B-LOCpart
B-ORG
B-ORGderiv
B-ORGpart
B-OTH
B-OTHderiv
B-OTHpart
B-PER
B-PERderiv
B-PERpart
I-LOC
I-LOCderiv
I-LOCpart
I-ORG
I-ORGderiv
I-ORGpart
I-OTH
I-OTHderiv
I-OTHpart
I-PER
I-PERderiv
I-PERpart
O
Schartau B-PER
sagte O
dem O
" O
Tagesspiegel B-ORG
" O
vom O
Freitag O
, O
Fischer B-PER
sei O
" O
in O
einer O
Weise O
aufgetreten O
, O
die O
alles O
andere O
als O
überzeugend O
war O
" O
. O
Firmengründer O
Wolf B-PER
Peter I-PER
Bree I-PER
arbeitete O
Anfang O
der O
siebziger O
Jahre O
als O
Möbelvertreter O
, O
als O
er O
einen O
fliegenden O
Händler O
aus O
dem O
Libanon B-LOC
traf O
. O
Ob O
sie O
dabei O
nach O
dem O
Runden O
Tisch O
am O
23. O
April O
in O
Berlin B-LOC
durch O
ein O
pädagogisches O
Konzept O
unterstützt O
wird O
, O
ist O
allerdings O
zu O
bezweifeln O
. O
Bayern B-ORG
München I-ORG
ist O
wieder O
alleiniger O
Top- O
Favorit O
auf O
den O
Gewinn O
der O
deutschen B-LOCderiv
Fußball-Meisterschaft O
. O
Dabei O
hätte O
der O
tapfere O
Schlussmann O
allen O
Grund O
gehabt O
, O
sich O
viel O
früher O
aufzuregen O
. O
ARD-Programmchef B-ORGpart
Günter B-PER
Struve I-PER
war O
wegen O
eines O
vierwöchigen O
Urlaubs O
für O
eine O
Stellungnahme O
nicht O
erreichbar O
. O
Alternativ O
sollten O
sich O
die O
Restaurantbetreiber O
aus O
Sicht O
der O
Solingerin B-LOCderiv
zu O
längeren O
Öffnungszeiten O
verpflichten O
, O
um O
wartende O
Kunden O
aufzunehmen O
. O
Die O
Deutsche B-ORG
Flugsicherung I-ORG
( O
DFS B-ORG
) O
beschloss O
ein O
Flugverbot O
für O
alle O
internationalen O
Flughäfen O
mit O
Ausnahme O
der O
beiden O
Berliner B-LOCderiv
Flughäfen O
bis O
2.00 O
Uhr O
nachts O
. O
New O
Small O
Family O
mit O
E-Motor O
: O
Studie O
E-Up O
! O
Eine O
Schwachstelle O
war O
beispielsweise O
der O
Spiegelkasten O
. O
Denn O
durch O
den O
Einsatz O
moderner O
Fahrzeugtechnik O
( O
Dieseltriebwagen O
) O
und O
schalldämmender O
Fenster O
entsteht O
keine O
Einschränkung O
der O
Wohnqualität O
. O
index genre filename year old_index source1 source2 sentence1 sentence2 score
0 main-captions MSRvid 2012test 0000 none none A man with a hard hat is dancing. A man wearing a hard hat is dancing. 5.000
1 main-captions MSRvid 2012test 0002 none none A young child is riding a horse. A child is riding a horse. 4.750
2 main-captions MSRvid 2012test 0003 none none A man is feeding a mouse to a snake. The man is feeding a mouse to the snake. 5.000
3 main-captions MSRvid 2012test 0007 none none A woman is playing the guitar. A man is playing guitar. 2.400
4 main-captions MSRvid 2012test 0008 none none A woman is playing the flute. A man is playing a flute. 2.750
5 main-captions MSRvid 2012test 0010 none none A woman is cutting an onion. A man is cutting onions. 2.615
6 main-captions MSRvid 2012test 0015 none none A man is erasing a chalk board. The man is erasing the chalk board. 5.000
7 main-captions MSRvid 2012test 0023 none none A woman is carrying a boy. A woman is carrying her baby. 2.333
8 main-captions MSRvid 2012test 0027 none none Three men are playing guitars. Three men are on stage playing guitars. 3.750
index genre filename year old_index source1 source2 sentence1 sentence2 score
0 main-captions MSRvid 2012test 0001 none none A plane is taking off. An air plane is taking off. 5.000
1 main-captions MSRvid 2012test 0004 none none A man is playing a large flute. A man is playing a flute. 3.800
2 main-captions MSRvid 2012test 0005 none none A man is spreading shreded cheese on a pizza. A man is spreading shredded cheese on an uncooked pizza. 3.800
3 main-captions MSRvid 2012test 0006 none none Three men are playing chess. Two men are playing chess. 2.600
4 main-captions MSRvid 2012test 0009 none none A man is playing the cello. A man seated is playing the cello. 4.250
5 main-captions MSRvid 2012test 0011 none none Some men are fighting. Two men are fighting. 4.250
6 main-captions MSRvid 2012test 0012 none none A man is smoking. A man is skating. 0.500
7 main-captions MSRvid 2012test 0013 none none The man is playing the piano. The man is playing the guitar. 1.600
8 main-captions MSRvid 2012test 0014 none none A man is playing on a guitar and singing. A woman is playing an acoustic guitar and singing. 2.200
@@ -8,7 +8,6 @@ import pytorch_lightning as pl
 import torch

 from transformers import (
-    ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
     AdamW,
     AutoConfig,
     AutoModel,
@@ -20,15 +19,11 @@ from transformers import (
     AutoTokenizer,
     get_linear_schedule_with_warmup,
 )
-from transformers.modeling_auto import MODEL_MAPPING

 logger = logging.getLogger(__name__)

-ALL_MODELS = tuple(ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
-MODEL_CLASSES = tuple(m.model_type for m in MODEL_MAPPING)
-
 MODEL_MODES = {
     "base": AutoModel,
     "sequence-classification": AutoModelForSequenceClassification,
@@ -51,28 +46,25 @@ class BaseTransformer(pl.LightningModule):
     def __init__(self, hparams: argparse.Namespace, num_labels=None, mode="base", **config_kwargs):
         "Initialize a model."

-        super(BaseTransformer, self).__init__()
+        super().__init__()
         self.hparams = hparams
         cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
-        self.hparams.model_type = self.hparams.model_type.lower()
-        config = AutoConfig.from_pretrained(
+        self.config = AutoConfig.from_pretrained(
             self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
             **({"num_labels": num_labels} if num_labels is not None else {}),
             cache_dir=cache_dir,
             **config_kwargs,
         )
-        tokenizer = AutoTokenizer.from_pretrained(
+        self.tokenizer = AutoTokenizer.from_pretrained(
             self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
-            do_lower_case=self.hparams.do_lower_case,
             cache_dir=cache_dir,
         )
-        model = MODEL_MODES[mode].from_pretrained(
+        self.model = MODEL_MODES[mode].from_pretrained(
             self.hparams.model_name_or_path,
             from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
-            config=config,
+            config=self.config,
             cache_dir=cache_dir,
         )
-        self.config, self.tokenizer, self.model = config, tokenizer, model

     def is_logger(self):
         return self.trainer.proc_rank <= 0
@@ -148,19 +140,12 @@ class BaseTransformer(pl.LightningModule):
     @staticmethod
     def add_model_specific_args(parser, root_dir):
-        parser.add_argument(
-            "--model_type",
-            default=None,
-            type=str,
-            required=True,
-            help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
-        )
         parser.add_argument(
             "--model_name_or_path",
             default=None,
             type=str,
             required=True,
-            help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+            help="Path to pretrained model or model identifier from huggingface.co/models",
         )
         parser.add_argument(
             "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
@@ -177,9 +162,6 @@ class BaseTransformer(pl.LightningModule):
             type=str,
             help="Where do you want to store the pre-trained models downloaded from s3",
         )
-        parser.add_argument(
-            "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-        )
         parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
         parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
         parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
@@ -252,8 +234,6 @@ def add_generic_args(parser, root_dir):
         help="Number of updates steps to accumulate before performing a backward/update pass.",
     )

-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
@@ -261,15 +241,6 @@ def generic_train(model: BaseTransformer, args: argparse.Namespace):
     # init model
     set_seed(args)

-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """

 import csv
@@ -21,48 +21,124 @@ import glob
 import json
 import logging
 import os
-from typing import List
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional

+import torch
 import tqdm
+from torch.utils.data.dataset import Dataset

-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, torch_distributed_zero_first

 logger = logging.getLogger(__name__)

-class InputExample(object):
-    """A single training/test example for multiple choice"""
-
-    def __init__(self, example_id, question, contexts, endings, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            example_id: Unique id for the example.
-            contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-            question: string. The untokenized text of the second sequence (question).
-            endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.example_id = example_id
-        self.question = question
-        self.contexts = contexts
-        self.endings = endings
-        self.label = label
-
-
-class InputFeatures(object):
-    def __init__(self, example_id, choices_features, label):
-        self.example_id = example_id
-        self.choices_features = [
-            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
-            for input_ids, input_mask, segment_ids in choices_features
-        ]
-        self.label = label
-
-
-class DataProcessor(object):
+@dataclass(frozen=True)
+class InputExample:
+    """
+    A single training/test example for multiple choice
+
+    Args:
+        example_id: Unique id for the example.
+        question: string. The untokenized text of the second sequence (question).
+        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
+        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
+        label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+
+    example_id: str
+    question: str
+    contexts: List[str]
+    endings: List[str]
+    label: Optional[str]
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.
+    """
+
+    example_id: str
+    input_ids: List[List[int]]
+    attention_mask: Optional[List[List[int]]]
+    token_type_ids: Optional[List[List[int]]]
+    label: Optional[int]
+
+
+class Split(Enum):
+    train = "train"
+    dev = "dev"
+    test = "test"
+
+
+class MultipleChoiceDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+    """
+
+    features: List[InputFeatures]
+
+    def __init__(
+        self,
+        data_dir: str,
+        tokenizer: PreTrainedTokenizer,
+        task: str,
+        max_seq_length: Optional[int] = None,
+        overwrite_cache=False,
+        mode: Split = Split.train,
+        local_rank=-1,
+    ):
+        processor = processors[task]()
+
+        cached_features_file = os.path.join(
+            data_dir,
+            "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
+        )
+        with torch_distributed_zero_first(local_rank):
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+
+            if os.path.exists(cached_features_file) and not overwrite_cache:
+                logger.info(f"Loading features from cached file {cached_features_file}")
+                self.features = torch.load(cached_features_file)
+            else:
+                logger.info(f"Creating features from dataset file at {data_dir}")
+                label_list = processor.get_labels()
+                if mode == Split.dev:
+                    examples = processor.get_dev_examples(data_dir)
+                elif mode == Split.test:
+                    examples = processor.get_test_examples(data_dir)
+                else:
+                    examples = processor.get_train_examples(data_dir)
+                logger.info("Training examples: %s", len(examples))
+                # TODO clean up all this to leverage built-in features of tokenizers
+                self.features = convert_examples_to_features(
+                    examples,
+                    label_list,
+                    max_seq_length,
+                    tokenizer,
+                    pad_on_left=bool(tokenizer.padding_side == "left"),
+                    pad_token=tokenizer.pad_token_id,
+                    pad_token_segment_id=tokenizer.pad_token_type_id,
+                )
+                if local_rank in [-1, 0]:
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, i) -> InputFeatures:
+        return self.features[i]
+
+
+class DataProcessor:
     """Base class for data converters for multiple choice data sets."""

     def get_train_examples(self, data_dir):
@@ -311,7 +387,7 @@ def convert_examples_to_features(
     for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
         if ex_index % 10000 == 0:
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        choices_features = []
+        choices_inputs = []
         for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
             text_a = context
             if example.question.find("_") != -1:
@@ -321,7 +397,7 @@ def convert_examples_to_features(
                 text_b = example.question + " " + ending

             inputs = tokenizer.encode_plus(
-                text_a, text_b, add_special_tokens=True, max_length=max_length, return_token_type_ids=True
+                text_a, text_b, add_special_tokens=True, max_length=max_length, pad_to_max_length=True,
             )
             if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                 logger.info(
@@ -330,41 +406,31 @@ def convert_examples_to_features(
                     "you need to try to use a bigger max seq length!"
                 )

-            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding_length = max_length - len(input_ids)
-            if pad_on_left:
-                input_ids = ([pad_token] * padding_length) + input_ids
-                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-            else:
-                input_ids = input_ids + ([pad_token] * padding_length)
-                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-            assert len(input_ids) == max_length
-            assert len(attention_mask) == max_length
-            assert len(token_type_ids) == max_length
-            choices_features.append((input_ids, attention_mask, token_type_ids))
-
-        label = label_map[example.label]
-
-        if ex_index < 2:
-            logger.info("*** Example ***")
-            logger.info("race_id: {}".format(example.example_id))
-            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
-                logger.info("label: {}".format(label))
-
-        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
+            choices_inputs.append(inputs)
+
+        label = label_map[example.label]
+
+        input_ids = [x["input_ids"] for x in choices_inputs]
+        attention_mask = (
+            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
+        )
+        token_type_ids = (
+            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
+        )
+
+        features.append(
+            InputFeatures(
+                example_id=example.example_id,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                label=label,
+            )
+        )
+
+    for f in features[:2]:
+        logger.info("*** Example ***")
+        logger.info("feature: %s" % f)

     return features
@@ -31,6 +31,8 @@ from .benchmark_utils import (
     start_memory_tracing,
     stop_memory_tracing,
 )

+# Configurations
 from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig
 from .configuration_bart import BartConfig
@@ -46,8 +48,6 @@ from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, Open
 from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
 from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-
-# Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
 from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
@@ -121,6 +121,8 @@ from .pipelines import (
     TranslationPipeline,
     pipeline,
 )

+# Tokenizers
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from .tokenization_bart import BartTokenizer, MBartTokenizer
@@ -136,8 +138,6 @@ from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
 from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
-
-# Tokenizers
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_xlm_roberta import XLMRobertaTokenizer
@@ -162,6 +162,7 @@ if is_torch_available():
         AutoModelForQuestionAnswering,
         AutoModelWithLMHead,
         AutoModelForTokenClassification,
+        AutoModelForMultipleChoice,
         ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
         MODEL_MAPPING,
         MODEL_FOR_PRETRAINING_MAPPING,
@@ -169,6 +170,7 @@ if is_torch_available():
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
         MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
     )
     from .modeling_bert import (
@@ -320,6 +322,10 @@ if is_torch_available():
         get_linear_schedule_with_warmup,
     )

+    # Trainer
+    from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
+    from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling
+    from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments

 # TensorFlow
 if is_tf_available():
@@ -87,7 +87,7 @@ class PretrainedConfig(object):
         self.architectures = kwargs.pop("architectures", None)
         self.finetuning_task = kwargs.pop("finetuning_task", None)
         self.num_labels = kwargs.pop("num_labels", 2)
-        self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
+        self.id2label = kwargs.pop("id2label", {i: f"LABEL_{i}" for i in range(self.num_labels)})
         self.id2label = dict((int(key), value) for key, value in self.id2label.items())
         self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
         self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, NewType, Tuple
import torch
from torch.nn.utils.rnn import pad_sequence
from ..tokenization_utils import PreTrainedTokenizer
class DataCollator(ABC):
"""
A `DataCollator` is responsible for batching
and pre-processing samples of data as requested by the training loop.
"""
@abstractmethod
def collate_batch(self) -> Dict[str, torch.Tensor]:
"""
Take a list of samples from a Dataset and collate them into a batch.
Returns:
A dictionary of tensors
"""
pass
InputDataClass = NewType("InputDataClass", Any)
@dataclass
class DefaultDataCollator(DataCollator):
"""
Very simple data collator that:
- simply collates batches of dict-like objects
- Performs special handling for potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
- does not do any additional preprocessing
i.e., Property names of the input object will be used as corresponding inputs to the model.
See glue and ner for example of how it's useful.
"""
def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
# In this method we'll make the assumption that all `features` in the batch
# have the same attributes.
# So we will look at the first element as a proxy for what attributes exist
# on the whole batch.
first = features[0]
# Special handling for labels.
# Ensure that tensor is created with the correct type
# (it should be automatically the case, but let's make sure of it.)
if hasattr(first, "label") and first.label is not None:
if type(first.label) is int:
labels = torch.tensor([f.label for f in features], dtype=torch.long)
else:
labels = torch.tensor([f.label for f in features], dtype=torch.float)
batch = {"labels": labels}
elif hasattr(first, "label_ids") and first.label_ids is not None:
if type(first.label_ids[0]) is int:
labels = torch.tensor([f.label_ids for f in features], dtype=torch.long)
else:
labels = torch.tensor([f.label_ids for f in features], dtype=torch.float)
batch = {"labels": labels}
else:
batch = {}
# Handling of all other possible attributes.
# Again, we will use the first element to figure out which key/values are not None for this model.
for k, v in vars(first).items():
if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long)
return batch
@dataclass
class DataCollatorForLanguageModeling(DataCollator):
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
tokenizer: PreTrainedTokenizer
mlm: bool = True
mlm_probability: float = 0.15
def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
batch = self._tensorize_batch(examples)
if self.mlm:
inputs, labels = self.mask_tokens(batch)
return {"input_ids": inputs, "masked_lm_labels": labels}
else:
return {"input_ids": batch, "labels": batch}
def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
length_of_first = examples[0].size(0)
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length:
return torch.stack(examples, dim=0)
else:
if self.tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({self.tokenizer.__class__.__name__}) does not have one."
)
return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
)
labels = inputs.clone()
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
probability_matrix = torch.full(labels.shape, self.mlm_probability)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
if self.tokenizer._pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
# 10% of the time, we replace masked input tokens with random word
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
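A minimal usage sketch for the collators above; the checkpoint name and the two toy sentences are illustrative only, not part of this commit:

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Illustrative checkpoint: any tokenizer with a pad token and a mask token works here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# The language modeling datasets below yield one 1-D tensor of token ids per example.
examples = [
    tokenizer.encode("Hello world", return_tensors="pt").squeeze(0),
    tokenizer.encode("A slightly longer example sentence", return_tensors="pt").squeeze(0),
]
batch = collator.collate_batch(examples)  # pads to the longest example, then masks ~15% of tokens
print(batch["input_ids"].shape, batch["masked_lm_labels"].shape)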
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
from .glue import GlueDataset, GlueDataTrainingArguments
from .language_modeling import LineByLineTextDataset, TextDataset
import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional
import torch
from torch.utils.data.dataset import Dataset
from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_xlm_roberta import XLMRobertaTokenizer
from ...trainer import torch_distributed_zero_first
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures
logger = logging.getLogger(__name__)
@dataclass
class GlueDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
data_dir: str = field(
metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
)
max_seq_length: int = field(
default=128,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
def __post_init__(self):
self.task_name = self.task_name.lower()
class GlueDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
args: GlueDataTrainingArguments
output_mode: str
features: List[InputFeatures]
def __init__(
self,
args: GlueDataTrainingArguments,
tokenizer: PreTrainedTokenizer,
limit_length: Optional[int] = None,
evaluate=False,
local_rank=-1,
):
self.args = args
processor = glue_processors[args.task_name]()
self.output_mode = glue_output_modes[args.task_name]
# Load data features from cache or dataset file
cached_features_file = os.path.join(
args.data_dir,
"cached_{}_{}_{}_{}".format(
"dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
),
)
with torch_distributed_zero_first(local_rank):
# Make sure only the first process in distributed training processes the dataset,
# and the others will use the cache.
if os.path.exists(cached_features_file) and not args.overwrite_cache:
start = time.time()
self.features = torch.load(cached_features_file)
logger.info(
f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
)
else:
logger.info(f"Creating features from dataset file at {args.data_dir}")
label_list = processor.get_labels()
if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
RobertaTokenizer,
RobertaTokenizerFast,
XLMRobertaTokenizer,
):
# HACK(label indices are swapped in RoBERTa pretrained model)
label_list[1], label_list[2] = label_list[2], label_list[1]
examples = (
processor.get_dev_examples(args.data_dir)
if evaluate
else processor.get_train_examples(args.data_dir)
)
if limit_length is not None:
examples = examples[:limit_length]
self.features = glue_convert_examples_to_features(
examples,
tokenizer,
max_length=args.max_seq_length,
label_list=label_list,
output_mode=self.output_mode,
)
if local_rank in [-1, 0]:
start = time.time()
torch.save(self.features, cached_features_file)
# ^ This seems to take a lot of time so I want to investigate why and how we can improve.
logger.info(
f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
)
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
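A short sketch of how GlueDataTrainingArguments and GlueDataset are meant to be combined; the task name and data directory are placeholders, not part of the diff:

from transformers import AutoTokenizer, GlueDataset, GlueDataTrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint
data_args = GlueDataTrainingArguments(
    task_name="sst-2",             # any key of glue_processors
    data_dir="./glue_data/SST-2",  # placeholder; must contain the task's .tsv files
    max_seq_length=128,
)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer)                # train split
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)  # dev split
print(len(train_dataset), train_dataset[0].input_ids[:10])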
import logging
import os
import pickle
import time
import torch
from torch.utils.data.dataset import Dataset
from ...tokenization_utils import PreTrainedTokenizer
from ...trainer import torch_distributed_zero_first
logger = logging.getLogger(__name__)
class TextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
def __init__(
self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1,
):
assert os.path.isfile(file_path)
block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
)
with torch_distributed_zero_first(local_rank):
# Make sure only the first process in distributed training processes the dataset,
# and the others will use the cache.
if os.path.exists(cached_features_file) and not overwrite_cache:
start = time.time()
with open(cached_features_file, "rb") as handle:
self.examples = pickle.load(handle)
logger.info(
f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
)
else:
logger.info(f"Creating features from dataset file at {directory}")
self.examples = []
with open(file_path, encoding="utf-8") as f:
text = f.read()
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size
self.examples.append(
tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
)
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
# If your dataset is small, first you should look for a bigger one :-) and second you

# can change this behavior by adding (model specific) padding.
start = time.time()
with open(cached_features_file, "wb") as handle:
pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(
f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
)
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> torch.Tensor:
return torch.tensor(self.examples[i], dtype=torch.long)
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
assert os.path.isfile(file_path)
# Here, we do not cache the features, operating under the assumption
# that we will soon use fast multithreaded tokenizers from the
# `tokenizers` repo everywhere =)
logger.info("Creating features from dataset file at %s", file_path)
with open(file_path, encoding="utf-8") as f:
lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
lines = lines[:50_000]
batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
self.examples = batch_encoding["input_ids"]
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> torch.Tensor:
return torch.tensor(self.examples[i], dtype=torch.long)
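These datasets pair directly with the collators above; a brief sketch, with the corpus path and checkpoint as placeholders:

from transformers import AutoTokenizer, DataCollatorForLanguageModeling, LineByLineTextDataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./my_corpus.txt", block_size=128)  # placeholder file
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)
batch = collator.collate_batch([dataset[i] for i in range(min(8, len(dataset)))])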
@@ -17,6 +17,7 @@
 import logging
 import os
+from enum import Enum
 from typing import List, Optional, Union

 from ...file_utils import is_tf_available
@@ -153,6 +154,11 @@ def _glue_convert_examples_to_features(
     return features


+class OutputMode(Enum):
+    classification = "classification"
+    regression = "regression"
+
+
 class MrpcProcessor(DataProcessor):
     """Processor for the MRPC data set (GLUE version)."""
@@ -14,13 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import copy
 import csv
 import dataclasses
 import json
 import logging
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional, Union

 from ...file_utils import is_tf_available, is_torch_available
@@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available
 logger = logging.getLogger(__name__)


-@dataclass(frozen=False)
+@dataclass
 class InputExample:
     """
     A single training/test example for simple sequence classification.
@@ -50,42 +49,37 @@ class InputExample:
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
-        return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self), indent=2) + "\n"


-class InputFeatures(object):
+@dataclass(frozen=True)
+class InputFeatures:
     """
     A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.

     Args:
         input_ids: Indices of input sequence tokens in the vocabulary.
         attention_mask: Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
     """

-    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None

     def to_json_string(self):
         """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self)) + "\n"


-class DataProcessor(object):
+class DataProcessor:
     """Base class for data converters for sequence classification data sets."""

     def get_example_from_tensor_dict(self, tensor_dict):
@@ -456,6 +456,11 @@ def get_from_cache(
     lock_path = cache_path + ".lock"
     with FileLock(lock_path):

+        # If the download just completed while the lock was activated.
+        if os.path.exists(cache_path) and not force_download:
+            # Even if returning early like here, the lock will be released.
+            return cache_path

         if resume_download:
             incomplete_path = cache_path + ".incomplete"
@@ -496,3 +501,50 @@ def get_from_cache(
         json.dump(meta, meta_file)

     return cache_path
class cached_property(property):
"""
Descriptor that mimics @property but caches output in member variable.
From tensorflow_datasets
Built-in in functools from Python 3.8.
"""
def __get__(self, obj, objtype=None):
# See docs.python.org/3/howto/descriptor.html#properties
if obj is None:
return self
if self.fget is None:
raise AttributeError("unreadable attribute")
attr = "__cached_" + self.fget.__name__
cached = getattr(obj, attr, None)
if cached is None:
cached = self.fget(obj)
setattr(obj, attr, cached)
return cached
def torch_required(func):
# Chose a different decorator name than in tests so it's clear they are not the same.
@wraps(func)
def wrapper(*args, **kwargs):
if is_torch_available():
return func(*args, **kwargs)
else:
raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
return wrapper
def tf_required(func):
# Chose a different decorator name than in tests so it's clear they are not the same.
@wraps(func)
def wrapper(*args, **kwargs):
if is_tf_available():
return func(*args, **kwargs)
else:
raise ImportError(f"Method `{func.__name__}` requires TF.")
return wrapper
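A hypothetical illustration of how cached_property and the backend guards above can be composed; the DeviceInfo class is not part of the commit:

from transformers.file_utils import cached_property, torch_required

class DeviceInfo:
    @cached_property
    @torch_required
    def device(self):
        # Hypothetical helper: raises ImportError unless PyTorch is installed;
        # the result is computed once and cached on the instance.
        import torch
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")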
@@ -55,6 +55,7 @@ from .modeling_bart import (
 from .modeling_bert import (
     BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     BertForMaskedLM,
+    BertForMultipleChoice,
     BertForPreTraining,
     BertForQuestionAnswering,
     BertForSequenceClassification,
@@ -64,6 +65,7 @@ from .modeling_bert import (
 from .modeling_camembert import (
     CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     CamembertForMaskedLM,
+    CamembertForMultipleChoice,
     CamembertForSequenceClassification,
     CamembertForTokenClassification,
     CamembertModel,
@@ -96,6 +98,7 @@ from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTL
 from .modeling_roberta import (
     ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
     RobertaForMaskedLM,
+    RobertaForMultipleChoice,
     RobertaForQuestionAnswering,
     RobertaForSequenceClassification,
     RobertaForTokenClassification,
@@ -114,12 +117,14 @@ from .modeling_xlm import (
 from .modeling_xlm_roberta import (
     XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
     XLMRobertaForMaskedLM,
+    XLMRobertaForMultipleChoice,
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
     XLMRobertaModel,
 )
 from .modeling_xlnet import (
     XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    XLNetForMultipleChoice,
     XLNetForQuestionAnsweringSimple,
     XLNetForSequenceClassification,
     XLNetForTokenClassification,
@@ -259,7 +264,18 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
 )

+MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
+    [
+        (CamembertConfig, CamembertForMultipleChoice),
+        (XLMRobertaConfig, XLMRobertaForMultipleChoice),
+        (RobertaConfig, RobertaForMultipleChoice),
+        (BertConfig, BertForMultipleChoice),
+        (XLNetConfig, XLNetForMultipleChoice),
+    ]
+)
+
+
-class AutoModel(object):
+class AutoModel:
     r"""
     :class:`~transformers.AutoModel` is a generic model class
     that will be instantiated as one of the base model classes of the library
@@ -410,7 +426,7 @@ class AutoModel(object):
     )


-class AutoModelForPreTraining(object):
+class AutoModelForPreTraining:
     r"""
     :class:`~transformers.AutoModelForPreTraining` is a generic model class
     that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
@@ -552,7 +568,7 @@ class AutoModelForPreTraining(object):
     )


-class AutoModelWithLMHead(object):
+class AutoModelWithLMHead:
     r"""
     :class:`~transformers.AutoModelWithLMHead` is a generic model class
     that will be instantiated as one of the language modeling model classes of the library
@@ -696,7 +712,7 @@ class AutoModelWithLMHead(object):
     )


-class AutoModelForSequenceClassification(object):
+class AutoModelForSequenceClassification:
     r"""
     :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
     that will be instantiated as one of the sequence classification model classes of the library
@@ -843,7 +859,7 @@ class AutoModelForSequenceClassification(object):
     )


-class AutoModelForQuestionAnswering(object):
+class AutoModelForQuestionAnswering:
     r"""
     :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
     that will be instantiated as one of the question answering model classes of the library
@@ -1126,3 +1142,55 @@ class AutoModelForTokenClassification:
                 ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
             )
         )
class AutoModelForMultipleChoice:
r"""
:class:`~transformers.AutoModelForMultipleChoice` is a generic model class
that will be instantiated as one of the multiple choice model classes of the library
when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoModelForMultipleChoice is designed to be instantiated "
"using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForMultipleChoice.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items():
if isinstance(config, config_class):
return model_class(config)
raise ValueError(
"Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__,
cls.__name__,
", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()),
)
)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
config = kwargs.pop("config", None)
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items():
if isinstance(config, config_class):
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
raise ValueError(
"Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__,
cls.__name__,
", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()),
)
)
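A short sketch of the new auto class in use; the checkpoint name is illustrative:

from transformers import AutoModelForMultipleChoice, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")           # illustrative checkpoint
model = AutoModelForMultipleChoice.from_pretrained("bert-base-cased")
# Resolution goes through MODEL_FOR_MULTIPLE_CHOICE_MAPPING above: a BertConfig selects
# BertForMultipleChoice, a RobertaConfig selects RobertaForMultipleChoice, and so on.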
import json
import logging
import os
import random
import re
import shutil
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
import numpy as np
import torch
from torch import nn
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from tqdm import tqdm, trange
from .data.data_collator import DataCollator, DefaultDataCollator
from .modeling_utils import PreTrainedModel
from .optimization import AdamW, get_linear_schedule_with_warmup
from .training_args import TrainingArguments
try:
from apex import amp
_has_apex = True
except ImportError:
_has_apex = False
def is_apex_available():
return _has_apex
try:
from torch.utils.tensorboard import SummaryWriter
_has_tensorboard = True
except ImportError:
try:
from tensorboardX import SummaryWriter
_has_tensorboard = True
except ImportError:
_has_tensorboard = False
def is_tensorboard_available():
return _has_tensorboard
logger = logging.getLogger(__name__)
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
@contextmanager
def torch_distributed_zero_first(local_rank: int):
"""
Decorator to make all processes in distributed training wait for the first one (locally) to do something.
"""
if local_rank not in [-1, 0]:
torch.distributed.barrier()
yield
if local_rank == 0:
torch.distributed.barrier()
class EvalPrediction(NamedTuple):
"""
Evaluation output (always contains labels), to be used
to compute metrics.
"""
predictions: np.ndarray
label_ids: np.ndarray
class PredictionOutput(NamedTuple):
predictions: np.ndarray
label_ids: Optional[np.ndarray]
metrics: Optional[Dict[str, float]]
class TrainOutput(NamedTuple):
global_step: int
training_loss: float
PREFIX_CHECKPOINT_DIR = "checkpoint"
class Trainer:
"""
Trainer is a simple but feature-complete training and eval loop for PyTorch,
optimized for Transformers.
"""
model: PreTrainedModel
args: TrainingArguments
data_collator: DataCollator
train_dataset: Optional[Dataset]
eval_dataset: Optional[Dataset]
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
prediction_loss_only: bool
tb_writer: Optional["SummaryWriter"] = None
def __init__(
self,
model: PreTrainedModel,
args: TrainingArguments,
data_collator: Optional[DataCollator] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Dataset] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
prediction_loss_only=False,
):
"""
Trainer is a simple but feature-complete training and eval loop for PyTorch,
optimized for Transformers.
Args:
prediction_loss_only:
(Optional) in evaluation and prediction, only return the loss
"""
self.model = model
self.args = args
if data_collator is not None:
self.data_collator = data_collator
else:
self.data_collator = DefaultDataCollator()
self.train_dataset = train_dataset
self.eval_dataset = eval_dataset
self.compute_metrics = compute_metrics
self.prediction_loss_only = prediction_loss_only
if is_tensorboard_available() and self.args.local_rank in [-1, 0]:
self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
if not is_tensorboard_available():
logger.warning(
"You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
)
set_seed(self.args.seed)
# Create output directory if needed
if self.args.local_rank in [-1, 0]:
os.makedirs(self.args.output_dir, exist_ok=True)
def get_train_dataloader(self) -> DataLoader:
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
train_sampler = (
RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset)
)
return DataLoader(
self.train_dataset,
batch_size=self.args.train_batch_size,
sampler=train_sampler,
collate_fn=self.data_collator.collate_batch,
)
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
return DataLoader(
eval_dataset if eval_dataset is not None else self.eval_dataset,
batch_size=self.args.eval_batch_size,
shuffle=False,
collate_fn=self.data_collator.collate_batch,
)
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
# We use the same batch_size as for eval.
return DataLoader(
test_dataset,
batch_size=self.args.eval_batch_size,
shuffle=False,
collate_fn=self.data_collator.collate_batch,
)
def get_optimizers(
self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": self.args.weight_decay,
},
{
"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
)
return optimizer, scheduler
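# Extension sketch (hypothetical subclass, not part of this module): the optimizer and
# schedule can be customized by overriding get_optimizers(), e.g. to train with a
# constant learning rate instead of the linear warmup/decay defined above.
#
#     class ConstantLRTrainer(Trainer):
#         def get_optimizers(self, num_training_steps):
#             optimizer = AdamW(self.model.parameters(), lr=self.args.learning_rate)
#             scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda _: 1.0)
#             return optimizer, scheduler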
def train(self, model_path: Optional[str] = None):
"""
Main training entry point.
Args:
model_path:
(Optional) Local path to the model, if the model to train has been instantiated from a local path.
If present, we will try reloading the optimizer/scheduler states from there.
"""
train_dataloader = self.get_train_dataloader()
if self.args.max_steps > 0:
t_total = self.args.max_steps
num_train_epochs = (
self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
)
else:
t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
num_train_epochs = self.args.num_train_epochs
optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)
# Check if saved optimizer or scheduler states exist
if (
model_path is not None
and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
):
# Load in optimizer and scheduler states
optimizer.load_state_dict(torch.load(os.path.join(model_path, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
model = self.model
model.to(self.args.device)
if self.args.fp16:
if not is_apex_available():
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if self.args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if self.args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model,
device_ids=[self.args.local_rank],
output_device=self.args.local_rank,
find_unused_parameters=True,
)
if self.tb_writer is not None:
self.tb_writer.add_text("args", self.args.to_json_string())
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataloader.dataset))
logger.info(" Num Epochs = %d", num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
self.args.train_batch_size
* self.args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if model_path is not None:
# set global_step to global_step of last saved checkpoint from model path
try:
global_step = int(model_path.split("-")[-1].split("/")[0])
epochs_trained = global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (
len(train_dataloader) // self.args.gradient_accumulation_steps
)
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
except ValueError:
global_step = 0
logger.info(" Starting fine-tuning.")
tr_loss = 0.0
logging_loss = 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
)
for epoch in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
for step, inputs in enumerate(epoch_iterator):
# Skip past any already trained steps if resuming training
if steps_trained_in_current_epoch > 0:
steps_trained_in_current_epoch -= 1
continue
tr_loss += self._training_step(model, inputs, optimizer)
if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
# last step in epoch, but the epoch has fewer batches than gradient_accumulation_steps
len(epoch_iterator) <= self.args.gradient_accumulation_steps
and (step + 1) == len(epoch_iterator)
):
if self.args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
optimizer.step()
scheduler.step()
model.zero_grad()
global_step += 1
if self.args.local_rank in [-1, 0]:
if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
global_step == 1 and self.args.logging_first_step
):
logs = {}
if self.args.evaluate_during_training:
results = self.evaluate()
for key, value in results.items():
eval_key = "eval_{}".format(key)
logs[eval_key] = value
loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
learning_rate_scalar = scheduler.get_last_lr()[0]
logs["learning_rate"] = learning_rate_scalar
logs["loss"] = loss_scalar
logging_loss = tr_loss
if self.tb_writer:
for k, v in logs.items():
self.tb_writer.add_scalar(k, v, global_step)
epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
# In all cases (even distributed/parallel), self.model is always a reference
# to the model we want to save.
if hasattr(model, "module"):
assert model.module is self.model
else:
assert model is self.model
# Save model checkpoint
output_dir = os.path.join(self.args.output_dir, f"checkpoint-{global_step}")
self.save_model(output_dir)
self._rotate_checkpoints()
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
logger.info("Saving optimizer and scheduler states to %s", output_dir)
if self.args.max_steps > 0 and global_step > self.args.max_steps:
epoch_iterator.close()
break
if self.args.max_steps > 0 and global_step > self.args.max_steps:
train_iterator.close()
break
if self.tb_writer:
self.tb_writer.close()
logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
return TrainOutput(global_step, tr_loss / global_step)
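# Usage sketch: train() can resume from a checkpoint directory saved by this class. The
# path below is hypothetical; it should contain the saved weights plus optimizer.pt and
# scheduler.pt, and its trailing "-<step>" suffix is what is parsed to restore global_step.
#
#     trainer.train()                                    # fresh fine-tuning run
#     trainer.train(model_path="./out/checkpoint-500")   # resume from a saved checkpoint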
def _training_step(
self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer
) -> float:
model.train()
for k, v in inputs.items():
inputs[k] = v.to(self.args.device)
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if self.args.gradient_accumulation_steps > 1:
loss = loss / self.args.gradient_accumulation_steps
if self.args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
return loss.item()
def is_world_master(self) -> bool:
"""
This will be True only in one process, even in distributed mode,
even when training on multiple machines.
"""
return self.args.local_rank == -1 or torch.distributed.get_rank() == 0
def save_model(self, output_dir: Optional[str] = None):
"""
Saving best-practices: if you use default names for the model,
you can reload it using from_pretrained().
Will only save from the master process.
"""
if self.is_world_master():
self._save(output_dir)
def _save(self, output_dir: Optional[str] = None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info("Saving model checkpoint to %s", output_dir)
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not isinstance(self.model, PreTrainedModel):
raise ValueError("Trainer.model appears to not be a PreTrainedModel")
self.model.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]:
ordering_and_checkpoint_path = []
glob_checkpoints = [str(x) for x in Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*")]
# cast to str so re.match below and the List[str] return type are satisfied
for path in glob_checkpoints:
if use_mtime:
ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
else:
regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
if regex_match and regex_match.groups():
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
return checkpoints_sorted
def _rotate_checkpoints(self, use_mtime=False) -> None:
if not self.args.save_total_limit:
return
if self.args.save_total_limit <= 0:
return
# Check if we should delete older checkpoint(s)
checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime)
if len(checkpoints_sorted) <= self.args.save_total_limit:
return
number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args.save_total_limit)
checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
for checkpoint in checkpoints_to_be_deleted:
logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
shutil.rmtree(checkpoint)
def evaluate(
self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None
) -> Dict[str, float]:
"""
Run evaluation and return metrics.
The calling script will be responsible for providing a method to compute metrics, as they are
task-dependent.
Args:
eval_dataset: (Optional) Pass a dataset if you wish to override
the one on the instance.
Returns:
A dict containing:
- the eval loss
- the potential metrics computed from the predictions
"""
eval_dataloader = self.get_eval_dataloader(eval_dataset)
output = self._prediction_loop(eval_dataloader, description="Evaluation")
return output.metrics
def predict(self, test_dataset: Dataset) -> PredictionOutput:
"""
Run prediction and return predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels.
In that case, this method will also return metrics, like in evaluate().
"""
test_dataloader = self.get_test_dataloader(test_dataset)
return self._prediction_loop(test_dataloader, description="Prediction")
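# Usage sketch: evaluate() returns a metrics dict (eval loss plus anything produced by
# compute_metrics), while predict() also returns the raw predictions. `test_dataset`
# below is a hypothetical torch Dataset, which may or may not contain labels.
#
#     metrics = trainer.evaluate()                 # uses the eval_dataset passed at init
#     output = trainer.predict(test_dataset)
#     np.save("predictions.npy", output.predictions)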
def _prediction_loop(
self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
"""
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
Works both with or without labels.
"""
prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
# multi-gpu eval
if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
model = torch.nn.DataParallel(self.model)
else:
model = self.model
model.to(self.args.device)
logger.info("***** Running %s *****", description)
logger.info(" Num examples = %d", len(dataloader.dataset))
logger.info(" Batch size = %d", dataloader.batch_size)
eval_losses: List[float] = []
preds: np.ndarray = None
label_ids: np.ndarray = None
model.eval()
for inputs in tqdm(dataloader, desc=description):
has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])
for k, v in inputs.items():
inputs[k] = v.to(self.args.device)
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
step_eval_loss, logits = outputs[:2]
eval_losses += [step_eval_loss.mean().item()]
else:
logits = outputs[0]
if not prediction_loss_only:
if preds is None:
preds = logits.detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
if inputs.get("labels") is not None:
if label_ids is None:
label_ids = inputs["labels"].detach().cpu().numpy()
else:
label_ids = np.append(label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
if self.compute_metrics is not None and preds is not None and label_ids is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
else:
metrics = {}
if len(eval_losses) > 0:
metrics["loss"] = np.mean(eval_losses)
return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
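# Usage sketch (illustrative user code, not part of this module): wiring a model and
# TrainingArguments into the Trainer above. "bert-base-cased" and "./out" are example
# values; the datasets are left as None placeholders, so train()/evaluate() would need
# real torch Datasets before being called, and compute_accuracy is a hypothetical metric.
if __name__ == "__main__":
    from transformers import AutoModelForSequenceClassification, TrainingArguments

    def compute_accuracy(p: EvalPrediction) -> Dict[str, float]:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": float((preds == p.label_ids).mean())}

    trainer = Trainer(
        model=AutoModelForSequenceClassification.from_pretrained("bert-base-cased"),
        args=TrainingArguments(output_dir="./out", logging_steps=100, save_steps=500),
        train_dataset=None,  # placeholder: a torch Dataset of tokenized examples
        eval_dataset=None,   # placeholder: a torch Dataset used by evaluate()
        compute_metrics=compute_accuracy,
    )
    # Once real datasets are supplied: trainer.train(); trainer.evaluate(); trainer.save_model()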
import dataclasses
import json
import logging
from dataclasses import dataclass, field
-from typing import Optional
from typing import Optional, Tuple
from .file_utils import cached_property, is_torch_available, torch_required
if is_torch_available():
import torch
logger = logging.getLogger(__name__)
@dataclass
@@ -22,6 +34,7 @@ class TrainingArguments:
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
evaluate_during_training: bool = field(
default=False, metadata={"help": "Run evaluation during training at each logging step."}
)
@@ -44,6 +57,8 @@ class TrainingArguments:
)
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
save_total_limit: Optional[int] = field(
@@ -52,12 +67,6 @@ class TrainingArguments:
"help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default"
},
)
-eval_all_checkpoints: bool = field(
-    default=False,
-    metadata={
-        "help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
-    },
-)
no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"})
seed: int = field(default=42, metadata={"help": "random seed for initialization"})
@@ -73,3 +82,47 @@ class TrainingArguments:
},
)
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
@property
def train_batch_size(self) -> int:
return self.per_gpu_train_batch_size * max(1, self.n_gpu)
@property
def eval_batch_size(self) -> int:
return self.per_gpu_eval_batch_size * max(1, self.n_gpu)
@cached_property
@torch_required
def _setup_devices(self) -> Tuple["torch.device", int]:
logger.info("PyTorch: setting up devices")
if self.no_cuda:
device = torch.device("cpu")
n_gpu = 0
elif self.local_rank == -1:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
else:
# Here, we'll use torch.distributed.
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend="nccl")
device = torch.device("cuda", self.local_rank)
n_gpu = 1
return device, n_gpu
@property
@torch_required
def device(self) -> "torch.device":
return self._setup_devices[0]
@property
@torch_required
def n_gpu(self):
return self._setup_devices[1]
def to_json_string(self):
"""
Serializes this instance to a JSON string.
"""
return json.dumps(dataclasses.asdict(self), indent=2)
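# Usage sketch (illustrative, not part of this module): constructing TrainingArguments
# directly and reading the derived properties. output_dir is the required destination
# directory; the remaining fields are ones visible in this diff, with arbitrary example
# values, and no_cuda=True keeps the demo on CPU.
if __name__ == "__main__":
    demo_args = TrainingArguments(
        output_dir="./out",
        logging_steps=100,
        save_steps=500,
        save_total_limit=2,
        no_cuda=True,
        seed=42,
    )
    # train_batch_size / eval_batch_size scale the per-GPU sizes by max(1, n_gpu).
    print(demo_args.device, demo_args.n_gpu, demo_args.train_batch_size)
    print(demo_args.to_json_string())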
@@ -5,8 +5,7 @@ from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
-from transformers.hf_argparser import HfArgumentParser
-from transformers.training_args import TrainingArguments
from transformers import HfArgumentParser, TrainingArguments
@dataclass