Unverified commit 562f8640, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into fix-xlnet-squad2.0

parents ca99a2d5 8618bf15
 # coding: utf8
 def main():
     import sys
-    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
-        print(
-            "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
-            "It should be used as one of: \n"
-            ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-            ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-            ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-            ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-            ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
-            ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
-    else:
-        if sys.argv[1] == "bert":
-            try:
-                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                      "In that case, it requires TensorFlow to be installed. Please see "
-                      "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-            if len(sys.argv) != 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
-            else:
-                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
-                TF_CONFIG = sys.argv.pop()
-                TF_CHECKPOINT = sys.argv.pop()
-                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "gpt":
-            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
-            else:
-                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    OPENAI_GPT_CONFIG = sys.argv[4]
-                else:
-                    OPENAI_GPT_CONFIG = ""
-                convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
-                                                     OPENAI_GPT_CONFIG,
-                                                     PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "transfo_xl":
-            try:
-                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                      "In that case, it requires TensorFlow to be installed. Please see "
-                      "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
-            else:
-                if 'ckpt' in sys.argv[2].lower():
-                    TF_CHECKPOINT = sys.argv[2]
-                    TF_DATASET_FILE = ""
-                else:
-                    TF_DATASET_FILE = sys.argv[2]
-                    TF_CHECKPOINT = ""
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    TF_CONFIG = sys.argv[4]
-                else:
-                    TF_CONFIG = ""
-                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
-        elif sys.argv[1] == "gpt2":
-            try:
-                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                      "In that case, it requires TensorFlow to be installed. Please see "
-                      "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
-            else:
-                TF_CHECKPOINT = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    TF_CONFIG = sys.argv[4]
-                else:
-                    TF_CONFIG = ""
-                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "xlnet":
-            try:
-                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                      "In that case, it requires TensorFlow to be installed. Please see "
-                      "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-            if len(sys.argv) < 5 or len(sys.argv) > 6:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
-            else:
-                TF_CHECKPOINT = sys.argv[2]
-                TF_CONFIG = sys.argv[3]
-                PYTORCH_DUMP_OUTPUT = sys.argv[4]
-                if len(sys.argv) == 6:
-                    FINETUNING_TASK = sys.argv[5]
-                else:
-                    FINETUNING_TASK = None
-                convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
-                                                    TF_CONFIG,
-                                                    PYTORCH_DUMP_OUTPUT,
-                                                    FINETUNING_TASK)
-        elif sys.argv[1] == "xlm":
-            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
-            if len(sys.argv) != 4:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
-            else:
-                XLM_CHECKPOINT_PATH = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
+    if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]:
+        print(
+            "First argument to `transformers` command line interface should be one of: \n"
+            ">> convert serve train predict")
+    if sys.argv[1] == "convert":
+        from transformers.commands import convert
+        convert(sys.argv)
+    elif sys.argv[1] == "train":
+        from transformers.commands import train
+        train(sys.argv)
+    elif sys.argv[1] == "serve":
+        pass
+        # from argparse import ArgumentParser
+        # from transformers.commands.serving import ServeCommand
+        # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
+        # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
+        # # Register commands
+        # ServeCommand.register_subcommand(commands_parser)
+        # # Let's go
+        # args = parser.parse_args()
+        # if not hasattr(args, 'func'):
+        #     parser.print_help()
+        #     exit(1)
+        # # Run
+        # service = args.func(args)
+        # service.run()
 if __name__ == '__main__':
     main()
from abc import ABC, abstractmethod
from argparse import ArgumentParser


class BaseTransformersCLICommand(ABC):
    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        raise NotImplementedError()
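In practice each subcommand subclasses this base: register_subcommand wires the command's arguments into the shared argparse parser and points the func default at a factory, while run does the actual work. A minimal sketch of a custom subcommand (the EchoCommand name and its --text flag are illustrative, not part of this commit):

from argparse import ArgumentParser


# hypothetical subcommand built on the BaseTransformersCLICommand above
class EchoCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        echo_parser = parser.add_parser('echo', help='Print the provided text back.')
        echo_parser.add_argument('--text', type=str, required=True, help='Text to print back.')
        # the CLI dispatcher later calls args.func(args) to build the command object
        echo_parser.set_defaults(func=lambda args: EchoCommand(args.text))

    def __init__(self, text: str):
        self._text = text

    def run(self):
        print(self._text)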
from argparse import ArgumentParser, Namespace
from logging import getLogger

from transformers.commands import BaseTransformersCLICommand


def convert_command_factory(args: Namespace):
    """
    Factory function used to convert a TF 1.0 model checkpoint into a PyTorch checkpoint.
    :return: ConvertCommand
    """
    return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output,
                          args.config, args.finetuning_task_name)


class ConvertCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser('convert', help="CLI tool to convert model checkpoints released by the "
                                                         "original authors to Transformers PyTorch checkpoints.")
        train_parser.add_argument('--model_type', type=str, required=True,
                                  help='Model\'s type.')
        train_parser.add_argument('--tf_checkpoint', type=str, required=True,
                                  help='TensorFlow checkpoint path or folder.')
        train_parser.add_argument('--pytorch_dump_output', type=str, required=True,
                                  help='Path to the PyTorch saved model output.')
        train_parser.add_argument('--config', type=str, default="",
                                  help='Configuration file path or folder.')
        train_parser.add_argument('--finetuning_task_name', type=str, default=None,
                                  help='Optional fine-tuning task name if the TF model was a finetuned model.')
        train_parser.set_defaults(func=convert_command_factory)

    def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str,
                 config: str, finetuning_task_name: str, *args):
        self._logger = getLogger('transformers-cli/converting')

        self._logger.info('Loading model {}'.format(model_type))
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
        self._config = config
        self._finetuning_task_name = finetuning_task_name

    def run(self):
        # Shared message for every converter that needs TensorFlow at import time
        TF_IMPORT_ERROR_MSG = "transformers can only be used from the command line to convert TensorFlow " \
                              "models to PyTorch when TensorFlow is installed. Please see " \
                              "https://www.tensorflow.org/install/ for installation instructions."

        if self._model_type == "bert":
            try:
                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
            except ImportError:
                raise ImportError(TF_IMPORT_ERROR_MSG)

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "gpt":
            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint,
                                                 self._config,
                                                 self._pytorch_dump_output)
        elif self._model_type == "transfo_xl":
            try:
                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
            except ImportError:
                raise ImportError(TF_IMPORT_ERROR_MSG)

            if 'ckpt' in self._tf_checkpoint.lower():
                TF_CHECKPOINT = self._tf_checkpoint
                TF_DATASET_FILE = ""
            else:
                TF_DATASET_FILE = self._tf_checkpoint
                TF_CHECKPOINT = ""
            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT,
                                                     self._config,
                                                     self._pytorch_dump_output,
                                                     TF_DATASET_FILE)
        elif self._model_type == "gpt2":
            try:
                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
            except ImportError:
                raise ImportError(TF_IMPORT_ERROR_MSG)

            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "xlnet":
            try:
                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
            except ImportError:
                raise ImportError(TF_IMPORT_ERROR_MSG)

            convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint,
                                                self._config,
                                                self._pytorch_dump_output,
                                                self._finetuning_task_name)
        elif self._model_type == "xlm":
            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
        else:
            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
from argparse import ArgumentParser

from transformers.commands import BaseTransformersCLICommand


def download_command_factory(args):
    return DownloadCommand(args.model, args.cache_dir, args.force)


class DownloadCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        download_parser = parser.add_parser('download')
        download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to the location where the models will be stored.')
        download_parser.add_argument('--force', action='store_true', help='Force the model to be downloaded even if it is already in cache-dir.')
        download_parser.add_argument('model', type=str, help='Name of the model to download.')
        download_parser.set_defaults(func=download_command_factory)

    def __init__(self, model: str, cache: str, force: bool):
        self._model = model
        self._cache = cache
        self._force = force

    def run(self):
        from transformers import AutoModel, AutoTokenizer

        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
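One run() call populates the cache with both the weights and the tokenizer. Driving the command object directly, assuming the module is importable as transformers.commands.download:

from transformers.commands.download import DownloadCommand  # module path assumed

# equivalent to: download bert-base-uncased --cache-dir ./models
cmd = DownloadCommand('bert-base-uncased', './models', False)
cmd.run()  # fills the cache via AutoModel and AutoTokenizer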
import logging
from argparse import ArgumentParser

from transformers.commands import BaseTransformersCLICommand
from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def try_infer_format_from_ext(path: str):
    if not path:
        return 'pipe'

    for ext in PipelineDataFormat.SUPPORTED_FORMATS:
        if path.endswith(ext):
            return ext

    raise Exception(
        'Unable to determine file format from file extension {}. '
        'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS)
    )


def run_command_factory(args):
    nlp = pipeline(task=args.task,
                   model=args.model if args.model else None,
                   config=args.config,
                   tokenizer=args.tokenizer,
                   device=args.device)
    format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
    reader = PipelineDataFormat.from_str(format=format,
                                         output_path=args.output,
                                         input_path=args.input,
                                         column=args.column if args.column else nlp.default_input_names,
                                         overwrite=args.overwrite)
    return RunCommand(nlp, reader)


class RunCommand(BaseTransformersCLICommand):
    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
        self._nlp = nlp
        self._reader = reader

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
        run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
        run_parser.add_argument('--output', type=str, help='Path to the file where results will be written.')
        run_parser.add_argument('--model', type=str, help='Name or path of the model to instantiate.')
        run_parser.add_argument('--config', type=str, help='Name or path of the model\'s config to instantiate.')
        run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
        run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi-column input such as QA, use column1,column2)')
        run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
        run_parser.add_argument('--device', type=int, default=-1, help='Device to run on: -1 indicates CPU, >= 0 indicates GPU (default: -1)')
        run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.')
        run_parser.set_defaults(func=run_command_factory)

    def run(self):
        nlp, outputs = self._nlp, []
        for entry in self._reader:
            output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
            if isinstance(output, dict):
                outputs.append(output)
            else:
                outputs += output

        # Saving data
        if self._nlp.binary_output:
            binary_path = self._reader.save_binary(outputs)
            logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path))
        else:
            self._reader.save(outputs)
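RunCommand can likewise be driven without the CLI wrapper; a sketch assuming a CSV of texts, an illustrative column name, and that 'sentiment-analysis' is among the SUPPORTED_TASKS keys:

from transformers.pipelines import pipeline, PipelineDataFormat

nlp = pipeline(task='sentiment-analysis')  # task key assumed to be registered in SUPPORTED_TASKS
reader = PipelineDataFormat.from_str(format='csv',
                                     output_path='./predictions.csv',  # illustrative paths
                                     input_path='./reviews.csv',
                                     column='review_text',
                                     overwrite=True)
RunCommand(nlp, reader).run()  # reads rows, runs the pipeline, saves the outputs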
import logging
from argparse import ArgumentParser, Namespace
from typing import List, Optional, Union, Any

try:
    from uvicorn import run
    from fastapi import FastAPI, HTTPException, Body
    from pydantic import BaseModel
    _serve_dependencies_installed = True
except (ImportError, AttributeError):
    BaseModel = object
    Body = lambda *x, **y: None
    _serve_dependencies_installed = False

from transformers import Pipeline
from transformers.commands import BaseTransformersCLICommand
from transformers.pipelines import SUPPORTED_TASKS, pipeline

logger = logging.getLogger('transformers-cli/serving')


def serve_command_factory(args: Namespace):
    """
    Factory function used to instantiate the serving server from the provided command line arguments.
    :return: ServeCommand
    """
    nlp = pipeline(task=args.task,
                   model=args.model if args.model else None,
                   config=args.config,
                   tokenizer=args.tokenizer,
                   device=args.device)
    return ServeCommand(nlp, args.host, args.port)


class ServeModelInfoResult(BaseModel):
    """
    Expose model information
    """
    infos: dict


class ServeTokenizeResult(BaseModel):
    """
    Tokenize result model
    """
    tokens: List[str]
    tokens_ids: Optional[List[int]]


class ServeDeTokenizeResult(BaseModel):
    """
    DeTokenize result model
    """
    text: str


class ServeForwardResult(BaseModel):
    """
    Forward result model
    """
    output: Any


class ServeCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.')
        serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on')
        serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.')
        serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.')
        serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.')
        serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.')
        serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.')
        serve_parser.add_argument('--device', type=int, default=-1, help='Device to run on: -1 indicates CPU, >= 0 indicates GPU (default: -1)')
        serve_parser.set_defaults(func=serve_command_factory)

    def __init__(self, pipeline: Pipeline, host: str, port: int):
        self._pipeline = pipeline

        self._host = host
        self._port = port

        if not _serve_dependencies_installed:
            raise ImportError("Using the serve command requires FastAPI and uvicorn. "
                              "Please install transformers with [serving]: pip install \"transformers[serving]\", "
                              "or install FastAPI and uvicorn separately.")
        else:
            logger.info('Serving model over {}:{}'.format(host, port))
            self._app = FastAPI()

            # Register routes
            self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET'])
            self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST'])
            self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST'])
            self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST'])

    def run(self):
        run(self._app, host=self._host, port=self._port)

    def model_info(self):
        return ServeModelInfoResult(infos=vars(self._pipeline.model.config))

    def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
        """
        Tokenize the provided input and eventually returns the corresponding tokens ids:
        - **text_input**: String to tokenize
        - **return_ids**: Boolean flag indicating whether tokens should be converted to their integer ids.
        """
        try:
            tokens_txt = self._pipeline.tokenizer.tokenize(text_input)

            if return_ids:
                tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
                return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
            else:
                return ServeTokenizeResult(tokens=tokens_txt)

        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})

    def detokenize(self, tokens_ids: List[int] = Body(None, embed=True),
                   skip_special_tokens: bool = Body(False, embed=True),
                   cleanup_tokenization_spaces: bool = Body(True, embed=True)):
        """
        Detokenize the provided tokens ids into readable text:
        - **tokens_ids**: List of tokens ids
        - **skip_special_tokens**: Flag indicating whether special tokens should be skipped when decoding
        - **cleanup_tokenization_spaces**: Flag indicating whether to remove all leading/trailing spaces and intermediate ones.
        """
        try:
            decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
            return ServeDeTokenizeResult(text=decoded_str)
        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})

    def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)):
        """
        Run the provided inputs through the underlying pipeline:
        - **inputs**: String, dict, or list thereof to feed to the pipeline
        """
        # Check we don't have an empty string
        if len(inputs) == 0:
            return ServeForwardResult(output=[])

        try:
            # Forward through the model
            output = self._pipeline(inputs)
            return ServeForwardResult(output=output)
        except Exception as e:
            raise HTTPException(500, {"error": str(e)})
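Because every parameter is declared with Body(..., embed=True), FastAPI expects a JSON object keyed by parameter name. Once the server is up on the defaults above, the tokenize endpoint could be exercised like this (a client-side sketch using the requests package, which is not a dependency of this commit):

import requests

resp = requests.post('http://localhost:8888/tokenize',
                     json={'text_input': 'Hello world', 'return_ids': True})
print(resp.json())  # e.g. {'tokens': [...], 'tokens_ids': [...]}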
import os
from argparse import ArgumentParser, Namespace
from logging import getLogger

from transformers.commands import BaseTransformersCLICommand
from transformers import (is_tf_available, is_torch_available,
                          TextClassificationPipeline,
                          SingleSentenceClassificationProcessor as Processor)

if not is_tf_available() and not is_torch_available():
    raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")

# TF training parameters
USE_XLA = False
USE_AMP = False


def train_command_factory(args: Namespace):
    """
    Factory function used to instantiate the training command from the provided command line arguments.
    :return: TrainCommand
    """
    return TrainCommand(args)


class TrainCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.')

        train_parser.add_argument('--train_data', type=str, required=True,
                                  help="path to train (and optionally evaluation) dataset as a csv with "
                                       "tab separated labels and sentences.")
        train_parser.add_argument('--column_label', type=int, default=0,
                                  help='Column of the dataset csv file with example labels.')
        train_parser.add_argument('--column_text', type=int, default=1,
                                  help='Column of the dataset csv file with example texts.')
        train_parser.add_argument('--column_id', type=int, default=2,
                                  help='Column of the dataset csv file with example ids.')
        train_parser.add_argument('--skip_first_row', action='store_true',
                                  help='Skip the first row of the csv file (headers).')
        train_parser.add_argument('--validation_data', type=str, default='',
                                  help='path to validation dataset.')
        train_parser.add_argument('--validation_split', type=float, default=0.1,
                                  help="if validation dataset is not provided, fraction of train dataset "
                                       "to use as validation dataset.")
        train_parser.add_argument('--output', type=str, default='./',
                                  help='path where the trained model will be saved.')
        train_parser.add_argument('--task', type=str, default='text_classification',
                                  help='Task to train the model on.')
        train_parser.add_argument('--model', type=str, default='bert-base-uncased',
                                  help='Model\'s name or path to stored model.')
        train_parser.add_argument('--train_batch_size', type=int, default=32,
                                  help='Batch size for training.')
        train_parser.add_argument('--valid_batch_size', type=int, default=64,
                                  help='Batch size for validation.')
        train_parser.add_argument('--learning_rate', type=float, default=3e-5,
                                  help="Learning rate.")
        train_parser.add_argument('--adam_epsilon', type=float, default=1e-08,
                                  help="Epsilon for Adam optimizer.")
        train_parser.set_defaults(func=train_command_factory)

    def __init__(self, args: Namespace):
        self.logger = getLogger('transformers-cli/training')

        self.framework = 'tf' if is_tf_available() else 'torch'

        os.makedirs(args.output, exist_ok=True)
        assert os.path.isdir(args.output)
        self.output = args.output

        self.column_label = args.column_label
        self.column_text = args.column_text
        self.column_id = args.column_id

        self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model))
        if args.task == 'text_classification':
            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
        elif args.task == 'token_classification':
            raise NotImplementedError
        elif args.task == 'question_answering':
            raise NotImplementedError

        self.logger.info('Loading dataset from {}'.format(args.train_data))
        self.train_dataset = Processor.create_from_csv(args.train_data,
                                                       column_label=args.column_label,
                                                       column_text=args.column_text,
                                                       column_id=args.column_id,
                                                       skip_first_row=args.skip_first_row)
        self.valid_dataset = None
        if args.validation_data:
            self.logger.info('Loading validation dataset from {}'.format(args.validation_data))
            self.valid_dataset = Processor.create_from_csv(args.validation_data,
                                                           column_label=args.column_label,
                                                           column_text=args.column_text,
                                                           column_id=args.column_id,
                                                           skip_first_row=args.skip_first_row)

        self.validation_split = args.validation_split
        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.learning_rate = args.learning_rate
        self.adam_epsilon = args.adam_epsilon

    def run(self):
        if self.framework == 'tf':
            return self.run_tf()
        return self.run_torch()

    def run_torch(self):
        raise NotImplementedError

    def run_tf(self):
        self.pipeline.fit(self.train_dataset,
                          validation_data=self.valid_dataset,
                          validation_split=self.validation_split,
                          learning_rate=self.learning_rate,
                          adam_epsilon=self.adam_epsilon,
                          train_batch_size=self.train_batch_size,
                          valid_batch_size=self.valid_batch_size)

        # Save trained pipeline
        self.pipeline.save_pretrained(self.output)
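Since TrainCommand only reads attributes off the parsed Namespace, it can also be exercised directly; a sketch with illustrative paths, assuming the module is importable as transformers.commands.train:

from argparse import Namespace

from transformers.commands.train import TrainCommand  # module path assumed

args = Namespace(output='./trained_model/', task='text_classification', model='bert-base-uncased',
                 train_data='./train.csv', column_label=0, column_text=1, column_id=2,
                 skip_first_row=True, validation_data='', validation_split=0.1,
                 train_batch_size=32, valid_batch_size=64, learning_rate=3e-5, adam_epsilon=1e-08)
TrainCommand(args).run()  # fits the pipeline (TF path) and saves it under args.output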
import os
from argparse import ArgumentParser
from getpass import getpass
from typing import List

from transformers.commands import BaseTransformersCLICommand
from transformers.hf_api import HfApi, HfFolder, HTTPError


class UserCommands(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        login_parser = parser.add_parser('login')
        login_parser.set_defaults(func=lambda args: LoginCommand(args))
        whoami_parser = parser.add_parser('whoami')
        whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
        logout_parser = parser.add_parser('logout')
        logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
        list_parser = parser.add_parser('ls')
        list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
        # upload
        upload_parser = parser.add_parser('upload')
        upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
        upload_parser.set_defaults(func=lambda args: UploadCommand(args))


class ANSI:
    """
    Helper for en.wikipedia.org/wiki/ANSI_escape_code
    """
    _bold = u"\u001b[1m"
    _reset = u"\u001b[0m"

    @classmethod
    def bold(cls, s):
        return "{}{}{}".format(cls._bold, s, cls._reset)


class BaseUserCommand:
    def __init__(self, args):
        self.args = args
        self._api = HfApi()


class LoginCommand(BaseUserCommand):
    def run(self):
        print("""
        _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
        _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
        _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
        _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
        _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
        """)
        username = input("Username: ")
        password = getpass()
        try:
            token = self._api.login(username, password)
        except HTTPError as e:
            # probably invalid credentials, display error message.
            print(e)
            exit(1)
        HfFolder.save_token(token)
        print("Login successful")
        print("Your token:", token, "\n")
        print("Your token has been saved to", HfFolder.path_token)


class WhoamiCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        try:
            user = self._api.whoami(token)
            print(user)
        except HTTPError as e:
            print(e)


class LogoutCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        HfFolder.delete_token()
        self._api.logout(token)
        print("Successfully logged out.")


class ListObjsCommand(BaseUserCommand):
    def tabulate(self, rows, headers):
        # type: (List[List[Union[str, int]]], List[str]) -> str
        """
        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
        lines = []
        lines.append(row_format.format(*headers))
        lines.append(row_format.format(*["-" * w for w in col_widths]))
        for row in rows:
            lines.append(row_format.format(*row))
        return "\n".join(lines)

    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            exit(1)
        if len(objs) == 0:
            print("No shared file yet")
            exit()
        rows = [[obj.filename,
                 obj.LastModified,
                 obj.ETag,
                 obj.Size] for obj in objs]
        print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))


class UploadCommand(BaseUserCommand):
    def walk_dir(self, rel_path):
        """
        Recursively list all files in a folder.
        """
        entries: List[os.DirEntry] = list(os.scandir(rel_path))
        files = [
            (
                os.path.join(os.getcwd(), f.path),  # filepath
                f.path  # filename
            )
            for f in entries if f.is_file()
        ]
        for f in entries:
            if f.is_dir():
                files += self.walk_dir(f.path)
        return files

    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        local_path = os.path.abspath(self.args.path)
        if os.path.isdir(local_path):
            if self.args.filename is not None:
                raise ValueError("Cannot specify a filename override when uploading a folder.")
            rel_path = os.path.basename(local_path)
            files = self.walk_dir(rel_path)
        elif os.path.isfile(local_path):
            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
            files = [(local_path, filename)]
        else:
            raise ValueError("Not a valid file or directory: {}".format(local_path))

        for filepath, filename in files:
            print(
                "About to upload file {} to S3 under filename {}".format(
                    ANSI.bold(filepath), ANSI.bold(filename)
                )
            )

        choice = input("Proceed? [Y/n] ").lower()
        if not (choice == "" or choice == "y" or choice == "yes"):
            print("Abort")
            exit()
        print(ANSI.bold("Uploading... This might take a while if files are large"))
        for filepath, filename in files:
            access_url = self._api.presign_and_upload(
                token=token, filename=filename, filepath=filepath
            )
            print("Your file now lives at:")
            print(access_url)
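All five user-facing subcommands hang off a single register_subcommand; wiring and dispatch follow the same func-factory pattern as the other commands (a sketch, assuming the module is importable as transformers.commands.user):

from argparse import ArgumentParser

from transformers.commands.user import UserCommands  # module path assumed

parser = ArgumentParser('transformers-cli')
commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
UserCommands.register_subcommand(commands_parser)

# e.g. `login`, `whoami`, `logout`, `ls`, or `upload ./my_model_dir/`
args = parser.parse_args()
args.func(args).run()  # each set_defaults(func=...) builds the matching command object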
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ALBERT model configuration """
from .configuration_utils import PretrainedConfig
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
}
class AlbertConfig(PretrainedConfig):
"""Configuration for `AlbertModel`.
The default settings match the configuration of model `albert_xxlarge`.
"""
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size=30000,
embedding_size=128,
hidden_size=4096,
num_hidden_layers=12,
num_hidden_groups=1,
num_attention_heads=64,
intermediate_size=16384,
inner_group_num=1,
hidden_act="gelu_new",
hidden_dropout_prob=0,
attention_probs_dropout_prob=0,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12, **kwargs):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_hidden_groups = num_hidden_groups
self.num_attention_heads = num_attention_heads
self.inner_group_num = inner_group_num
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
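Because the defaults above describe albert_xxlarge, a smaller variant is just a matter of overriding fields; a sketch (the override values are illustrative, the exact per-model hyperparameters live in the hosted config files listed above):

from transformers.configuration_albert import AlbertConfig

# illustrative overrides on top of the albert_xxlarge defaults
config = AlbertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072)
print(config.num_hidden_groups)  # 1: the hidden layers all share one group of parameters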
@@ -18,19 +18,42 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 
-from .configuration_bert import BertConfig
-from .configuration_openai import OpenAIGPTConfig
-from .configuration_gpt2 import GPT2Config
-from .configuration_transfo_xl import TransfoXLConfig
-from .configuration_xlnet import XLNetConfig
-from .configuration_xlm import XLMConfig
-from .configuration_roberta import RobertaConfig
-from .configuration_distilbert import DistilBertConfig
-from .configuration_ctrl import CTRLConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 logger = logging.getLogger(__name__)
 
+ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ]
+    for key, value in pretrained_map.items())
+
 class AutoConfig(object):
     r""":class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
@@ -43,13 +66,16 @@ class AutoConfig(object):
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
 
         This class cannot be instantiated using `__init__()` (it throws an error).
     """
@@ -57,6 +83,34 @@ class AutoConfig(object):
         raise EnvironmentError("AutoConfig is designed to be instantiated "
             "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
 
+    @classmethod
+    def for_model(cls, model_type, *args, **kwargs):
+        if 'distilbert' in model_type:
+            return DistilBertConfig(*args, **kwargs)
+        elif 'roberta' in model_type:
+            return RobertaConfig(*args, **kwargs)
+        elif 'bert' in model_type:
+            return BertConfig(*args, **kwargs)
+        elif 'openai-gpt' in model_type:
+            return OpenAIGPTConfig(*args, **kwargs)
+        elif 'gpt2' in model_type:
+            return GPT2Config(*args, **kwargs)
+        elif 'transfo-xl' in model_type:
+            return TransfoXLConfig(*args, **kwargs)
+        elif 'xlnet' in model_type:
+            return XLNetConfig(*args, **kwargs)
+        elif 'xlm' in model_type:
+            return XLMConfig(*args, **kwargs)
+        elif 'ctrl' in model_type:
+            return CTRLConfig(*args, **kwargs)
+        elif 'albert' in model_type:
+            return AlbertConfig(*args, **kwargs)
+        elif 'camembert' in model_type:
+            return CamembertConfig(*args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type))
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r""" Instantiate one of the configuration classes of the library
@@ -64,19 +118,24 @@ class AutoConfig(object):
         The configuration class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
 
         Params:
             pretrained_model_name_or_path: either:
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
@@ -92,6 +151,9 @@ class AutoConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -114,8 +176,16 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -134,4 +204,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
 }
@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig):
     Arguments:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+        vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
         hidden_size: Size of the encoder layers and the pooler layer.
         num_hidden_layers: Number of hidden layers in the Transformer encoder.
         num_attention_heads: Number of attention heads for each attention layer in
@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig):
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  hidden_size=768,
                  num_hidden_layers=12,
                  num_attention_heads=12,
@@ -91,14 +97,7 @@ class BertConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
                  **kwargs):
         super(BertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
@@ -110,6 +109,3 @@ class BertConfig(PretrainedConfig):
         self.type_vocab_size = type_vocab_size
         self.initializer_range = initializer_range
         self.layer_norm_eps = layer_norm_eps
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CamemBERT configuration """
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging

from .configuration_roberta import RobertaConfig

logger = logging.getLogger(__name__)

CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
}


class CamembertConfig(RobertaConfig):
    pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `CTRLModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         dff: Size of the inner dimension of the FFN.
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=246534,
+        vocab_size=246534,
         n_positions=256,
         n_ctx=256,
         n_embd=1280,
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-6,
         initializer_range=0.02,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
         """Constructs CTRLConfig.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             dff: Size of the inner dimension of the FFN.
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(CTRLConfig, self).__init__(**kwargs)
-
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
 
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
 
     @property
     def max_position_embeddings(self):
 ...
@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
+    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
+    'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
+    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
 }
@@ -35,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  max_position_embeddings=512,
                  sinusoidal_pos_embds=False,
                  n_layers=6,
@@ -51,15 +53,7 @@ class DistilBertConfig(PretrainedConfig):
                  seq_classif_dropout=0.2,
                  **kwargs):
         super(DistilBertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.sinusoidal_pos_embds = sinusoidal_pos_embds
         self.n_layers = n_layers
@@ -73,9 +67,7 @@ class DistilBertConfig(PretrainedConfig):
         self.tie_weights_ = tie_weights_
         self.qa_dropout = qa_dropout
         self.seq_classif_dropout = seq_classif_dropout
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")

     @property
     def hidden_size(self):
         return self.dim
...
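The two new archive-map entries make the German and multilingual DistilBERT checkpoints addressable by shortcut name. A hedged sketch (requires network access, and assumes the config JSONs are live at the S3 URLs above):

from transformers import DistilBertConfig

# Shortcut names now resolve through DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP.
config = DistilBertConfig.from_pretrained('distilbert-base-german-cased')
print(config.vocab_size, config.n_layers)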
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model`.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=50257,
+        vocab_size=50257,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
         """Constructs GPT2Config.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model`.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -96,15 +94,7 @@ class GPT2Config(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(GPT2Config, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -115,18 +105,11 @@ class GPT2Config(PretrainedConfig):
         self.attn_pdrop = attn_pdrop
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )

     @property
     def max_position_embeddings(self):
...
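One behavioral detail worth flagging across the GPT-2, GPT and CTRL hunks: num_labels is removed from each subclass signature, so it now flows through **kwargs into PretrainedConfig.__init__, where the default is 2 rather than the old per-model default of 1. A hedged sketch of the new flow, assuming this commit:

from transformers import GPT2Config

config = GPT2Config(vocab_size=50257, num_labels=3)
assert config.num_labels == 3  # popped and stored by the base class
# id2label / label2id defaults are derived from num_labels (see
# configuration_utils below).
assert config.id2label == {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
assert config.label2id['LABEL_2'] == 2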
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     Configuration class to store the configuration of a `OpenAIGPTModel`.
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel`.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size=40478,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -83,15 +81,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         """Constructs OpenAIGPTConfig.
         """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -104,18 +94,11 @@ class OpenAIGPTConfig(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
         self.predict_special_tokens = predict_special_tokens
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )

     @property
     def max_position_embeddings(self):
...
# coding=utf-8
# Copyright 2010, The T5 Authors and HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" T5 model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import sys
import six
from io import open
from .configuration_utils import PretrainedConfig
logger = logging.getLogger(__name__)
T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
}
class T5Config(PretrainedConfig):
r"""
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
`T5Model`.
Arguments:
        vocab_size: Vocabulary size of `inputs_ids` in `T5Model`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`T5Model`.
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
layer_norm_eps: The epsilon used by LayerNorm.
"""
pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size=32128,
n_positions=512,
d_model=512,
d_kv=64,
d_ff=2048,
num_layers=6,
num_heads=8,
relative_attention_num_buckets=32,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
**kwargs):
super(T5Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_positions = n_positions
self.d_model = d_model
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.num_heads
@property
def num_hidden_layers(self):
return self.num_layers
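Since configuration_t5.py is a brand-new file, it already follows the keyword-only pattern the rest of this commit migrates to, and it exposes the library-wide generic names as read-only property aliases over T5's native parameter names. A small sketch, assuming T5Config is exported from the package root at this commit:

from transformers import T5Config

config = T5Config(vocab_size=32128, d_model=512, num_layers=6, num_heads=8)
# Generic names resolve to the T5-native attributes via the properties above.
assert config.hidden_size == config.d_model
assert config.num_attention_heads == config.num_heads
assert config.num_hidden_layers == config.num_layers
assert config.max_position_embeddings == config.n_positions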
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel`.
         cutoffs: cutoffs for the adaptive softmax
         d_model: Dimensionality of the model's hidden states.
         d_embed: Dimensionality of the embeddings
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=267735,
+                 vocab_size=267735,
                  cutoffs=[20000, 40000, 200000],
                  d_model=1024,
                  d_embed=1024,
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
         """Constructs TransfoXLConfig.
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
-        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.cutoffs = []
         self.cutoffs.extend(cutoffs)
         self.tie_weight = tie_weight
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
         self.init_std = init_std
         self.layer_norm_epsilon = layer_norm_epsilon
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")

     @property
     def max_position_embeddings(self):
         return self.tgt_len + self.ext_len + self.mem_len

     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value

     @property
     def hidden_size(self):
...
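The property pair above inverts the old aliasing: vocab_size becomes the stored attribute and n_token the backward-compatible view, so older code that reads or writes config.n_token keeps working. A minimal sketch, assuming this commit:

from transformers import TransfoXLConfig

config = TransfoXLConfig(vocab_size=267735)
assert config.n_token == 267735   # alias reads vocab_size
config.n_token = 1000             # alias setter writes vocab_size
assert config.vocab_size == 1000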
@@ -24,7 +24,7 @@ import logging
 import os
 from io import open

-from .file_utils import cached_path, CONFIG_NAME
+from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url

 logger = logging.getLogger(__name__)
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
     pretrained_config_archive_map = {}

     def __init__(self, **kwargs):
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
-        self.num_labels = kwargs.pop('num_labels', 2)
+        # Attributes with defaults
         self.output_attentions = kwargs.pop('output_attentions', False)
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
         self.output_past = kwargs.pop('output_past', True)  # Not used by all models
@@ -59,6 +58,22 @@ class PretrainedConfig(object):
         self.pruned_heads = kwargs.pop('pruned_heads', {})
         self.is_decoder = kwargs.pop('is_decoder', False)

+        # Fine-tuning task arguments
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
+        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
+        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
+
+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err

     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
@@ -79,6 +94,7 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
@@ -94,6 +110,9 @@ class PretrainedConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -120,6 +139,7 @@ class PretrainedConfig(object):
         """
         cache_dir = kwargs.pop('cache_dir', None)
         force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
         proxies = kwargs.pop('proxies', None)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
@@ -127,11 +147,18 @@ class PretrainedConfig(object):
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        else:
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
-            # redirect to the cache, if necessary
+        else:
+            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)

         try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            # Load from URL or cache if already cached
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
+            # Load config
+            config = cls.from_json_file(resolved_config_file)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
@@ -145,15 +172,18 @@ class PretrainedConfig(object):
                     config_file, CONFIG_NAME)
             raise EnvironmentError(msg)
+        except json.JSONDecodeError:
+            msg = "Couldn't reach server at '{}' to download configuration file or " \
+                  "configuration file is not a valid JSON file. " \
+                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            raise EnvironmentError(msg)

         if resolved_config_file == config_file:
             logger.info("loading configuration file {}".format(config_file))
         else:
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))

-        # Load config
-        config = cls.from_json_file(resolved_config_file)
         if hasattr(config, 'pruned_heads'):
             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
@@ -175,17 +205,15 @@ class PretrainedConfig(object):
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `Config` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            setattr(config, key, value)
-        return config
+        return cls(**json_object)

     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
+        """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
-        return cls.from_dict(json.loads(text))
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)

     def __eq__(self, other):
         return self.__dict__ == other.__dict__
...
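Pulling the configuration_utils changes together: from_pretrained gains a fourth resolution path (bare identifiers mapped to S3 via hf_bucket_url), a resume_download flag, and JSON parsing moved inside the try block; from_dict and from_json_file now simply forward to __init__. A hedged usage sketch (network access assumed, model names illustrative):

from transformers import BertConfig, GPT2Config

# Shortcut names resolve through the archive map; downloads can now be resumed.
c1 = BertConfig.from_pretrained('bert-base-uncased', resume_download=True)

# Anything that is not a file, directory, or URL is treated as a user-uploaded
# identifier and resolved to an S3 URL via hf_bucket_url.
c2 = BertConfig.from_pretrained('dbmdz/bert-base-german-cased')

# from_dict now round-trips through __init__ keywords; unknown keys land in
# **kwargs and are handled by PretrainedConfig.
c3 = GPT2Config.from_dict({'vocab_size': 1000, 'num_labels': 4})
assert c3.vocab_size == 1000 and len(c3.id2label) == 4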
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
         d_model: Size of the encoder layers and the pooler layer.
         n_layer: Number of hidden layers in the Transformer encoder.
         n_head: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=30145,
+                 vocab_size=30145,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
                  unk_index=3,
                  mask_index=5,
                  is_encoder=True,
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='first',
                  summary_use_proj=True,
                  summary_activation=None,
@@ -117,15 +114,7 @@ class XLMConfig(PretrainedConfig):
         """Constructs XLMConfig.
         """
         super(XLMConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_words = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.emb_dim = emb_dim
         self.n_layers = n_layers
         self.n_heads = n_heads
@@ -147,8 +136,6 @@ class XLMConfig(PretrainedConfig):
         self.max_position_embeddings = max_position_embeddings
         self.embed_init_std = embed_init_std
         self.init_std = init_std
-        self.finetuning_task = finetuning_task
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
@@ -156,17 +143,17 @@ class XLMConfig(PretrainedConfig):
         self.summary_first_dropout = summary_first_dropout
         self.start_n_top = start_n_top
         self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+
+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]

     @property
-    def vocab_size(self):
-        return self.n_words
+    def n_words(self):  # For backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_words = value
+    @n_words.setter
+    def n_words(self, value):  # For backward compatibility
+        self.vocab_size = value

     @property
     def hidden_size(self):
...
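XLM gets the same inversion as Transformer-XL, plus an explicit escape hatch for callers still passing n_words as a keyword. Sketched under the same assumptions (the final assertion depends on the `if "n_words" in kwargs` block running after the vocab_size assignment, as the hunk above suggests):

from transformers import XLMConfig

config = XLMConfig(vocab_size=30145)
assert config.n_words == 30145     # alias reads vocab_size

config2 = XLMConfig(n_words=64139)  # old-style kwarg still honoured
assert config2.vocab_size == 64139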