Unverified Commit 146c5212 authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
@@ -935,7 +935,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss

-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
         # Add dummy token at the end (no attention on this one)
         effective_batch_size = input_ids.shape[0]
@@ -958,8 +958,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         inputs = {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}

         # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
+        if past:
+            inputs["mems"] = past

         return inputs
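For orientation only (this sketch is not part of the diff): the new `past` argument carries previously cached XLNet memories, which end up under the `mems` key of the prepared inputs. The checkpoint name and tensor shapes below are illustrative assumptions.

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    input_ids = tokenizer.encode("Hello", return_tensors="pt")
    dummy_mems = [torch.zeros(3, 1, model.config.d_model)]  # stand-in for previously cached memories
    inputs = model.prepare_inputs_for_generation(input_ids, past=dummy_mems)
    assert "mems" in inputs  # the cache is forwarded so decoding can reuse it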
@@ -1264,8 +1264,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
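Both masking strategies give the same loss: the old one indexes out the active positions, the new one feeds everything and marks inactive positions with `ignore_index`, which `CrossEntropyLoss` skips. A self-contained check with made-up shapes (illustrative only, not part of the diff):

    import torch
    from torch.nn import CrossEntropyLoss

    loss_fct = CrossEntropyLoss()
    logits = torch.randn(2, 4, 3)                    # (batch, seq_len, num_labels)
    labels = torch.randint(0, 3, (2, 4))
    attention_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])

    active_loss = attention_mask.view(-1) == 1
    # Old behaviour: keep only the active positions.
    old_loss = loss_fct(logits.view(-1, 3)[active_loss], labels.view(-1)[active_loss])
    # New behaviour: keep everything, set inactive labels to ignore_index.
    active_labels = torch.where(
        active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
    )
    new_loss = loss_fct(logits.view(-1, 3), active_labels)
    assert torch.allclose(old_loss, new_loss)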
...
@@ -28,6 +28,7 @@ from typing import Dict, List, Optional, Tuple, Union
 import numpy as np

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
+from .configuration_bart import BartConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_utils import PretrainedConfig
@@ -279,6 +280,9 @@ class _ScikitCompat(ABC):
 class Pipeline(_ScikitCompat):
     """
+    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+    different pipelines.
+
     Base class implementing pipelined operations.

     Pipeline workflow is defined as a sequence of the following operations:

         Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
@@ -292,39 +296,49 @@ class Pipeline(_ScikitCompat):
     pickle format.

     Arguments:
-        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
-            Reference to the model to use through this pipeline.
-        **tokenizer**: ``(str, PreTrainedTokenizer)``:
-            Reference to the tokenizer to use through this pipeline.
-        **args_parser**: ``ArgumentHandler``:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
             Reference to the object in charge of parsing supplied pipeline parameters.
-        **device**: ``int``:
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
             Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
             on the associated CUDA device id.
-        **binary_output** ``bool`` (default: False):
+        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.

     Return:
+        :obj:`List` or :obj:`Dict`:
         Pipeline returns list or dictionary depending on:
-        - Does the user provided multiple sample
-        - The pipeline expose multiple fields in the output object
+        - Whether the user supplied multiple samples
+        - Whether the pipeline exposes multiple fields in the output object
-    Examples:
-        nlp = pipeline('ner')
-        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
-        nlp = NerPipeline(model='...', config='...', tokenizer='...')
-        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
     """
     default_input_names = None
+    task = None

     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -336,6 +350,8 @@ class Pipeline(_ScikitCompat):
         if framework is None:
             framework = get_framework()

+        model, tokenizer = self.get_defaults(model, tokenizer, framework)
+
         self.model = model
         self.tokenizer = tokenizer
         self.modelcard = modelcard
@@ -412,7 +428,7 @@ class Pipeline(_ScikitCompat):
         """
         args = ["input_ids", "attention_mask"]

-        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
+        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig)):
             args += ["token_type_ids"]

         # PR #1548 (CLI) There is an issue with attention_mask
@@ -467,15 +483,74 @@ class Pipeline(_ScikitCompat):
         else:
             return predictions.numpy()
+    def get_defaults(self, model, tokenizer, framework):
+        task_defaults = SUPPORTED_TASKS[self.task]
+        if model is None:
+            if framework == "tf":
+                model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
+            elif framework == "pt":
+                model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
+            else:
+                raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")
+
+        if tokenizer is None:
+            default_tokenizer = task_defaults["default"]["tokenizer"]
+            if isinstance(default_tokenizer, tuple):
+                # For tuple we have (tokenizer name, {kwargs})
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)
+
+        return model, tokenizer
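With `task` set on each subclass and `get_defaults` called from `__init__`, a pipeline can be built without naming a model or tokenizer at all. A minimal usage sketch (illustrative only; it downloads the default checkpoint for the task, and the printed score is a made-up placeholder):

    from transformers import pipeline

    nlp = pipeline("sentiment-analysis")      # model and tokenizer fall back to the task defaults
    print(nlp("This is great!"))              # e.g. [{'label': 'POSITIVE', 'score': 0.999}]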
 class FeatureExtractionPipeline(Pipeline):
     """
-    Feature extraction pipeline using Model head.
+    Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
+    which can be used as features in a downstream tasks.
+
+    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "feature-extraction", for extracting features of a sequence.
+
+    All models may be used for this pipeline. See a list of all models, including community-contributed models on
+    `huggingface.co/models <https://huggingface.co/models>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "feature-extraction"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -498,9 +573,49 @@ class FeatureExtractionPipeline(Pipeline):
 class TextClassificationPipeline(Pipeline):
     """
-    Text classification pipeline using ModelForTextClassification head.
+    Text classification pipeline using ModelForSequenceClassification head. See the
+    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.
+
+    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.
+
+    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=text-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "sentiment-analysis"
+
     def __call__(self, *args, **kwargs):
         outputs = super().__call__(*args, **kwargs)
         scores = np.exp(outputs) / np.exp(outputs).sum(-1)
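The last line above normalizes the logits with a softmax. A standalone sketch of the same computation (illustrative only; the maximum is subtracted first, a standard numerical-stability trick that does not change the result):

    import numpy as np

    logits = np.array([[1.2, -0.3, 0.5]])                    # made-up logits, shape (1, num_labels)
    shifted = np.exp(logits - logits.max(-1, keepdims=True))
    scores = shifted / shifted.sum(-1, keepdims=True)
    assert np.isclose(scores.sum(), 1.0)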
@@ -509,12 +624,53 @@ class TextClassificationPipeline(Pipeline):
 class FillMaskPipeline(Pipeline):
     """
-    Masked language modeling prediction pipeline using ModelWithLMHead head.
+    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
+    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.
+
+    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "fill-mask", for predicting masked tokens in a sequence.
+
+    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
+    which includes the bi-directional models in the library.
+    See the list of available community models on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "fill-mask"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -574,14 +730,57 @@ class FillMaskPipeline(Pipeline):
 class NerPipeline(Pipeline):
     """
-    Named Entity Recognition pipeline using ModelForTokenClassification head.
+    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
+    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.
+
+    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.
+
+    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=token-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+
+    Example::
+
+        from transformers import pi
     """

     default_input_names = "sequences"
+    task = "ner"

     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -636,7 +835,7 @@ class NerPipeline(Pipeline):
                 if self.model.config.id2label[label_idx] not in self.ignore_labels:
                     answer += [
                         {
-                            "word": self.tokenizer.decode([int(input_ids[idx])]),
+                            "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                             "score": score[idx][label_idx].item(),
                             "entity": self.model.config.id2label[label_idx],
                         }
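Illustrative only (the scores and token strings below are assumptions, not part of the diff): with `convert_ids_to_tokens`, the `word` field now holds the raw wordpiece token, so sub-words keep their `##` prefix instead of being decoded back to plain text.

    from transformers import pipeline

    nlp = pipeline("ner")
    print(nlp("Hugging Face is based in New York City"))
    # e.g. [{'word': 'Hu', 'score': 0.99, 'entity': 'I-ORG'},
    #       {'word': '##gging', 'score': 0.98, 'entity': 'I-ORG'}, ...]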
@@ -716,15 +915,54 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
 class QuestionAnsweringPipeline(Pipeline):
     """
-    Question Answering pipeline using ModelForQuestionAnswering head.
+    Question Answering pipeline using ModelForQuestionAnswering head. See the
+    `question answering usage <../usage.html#question-answering>`__ examples for more information.
+
+    This question answering can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "question-answering", for answering questions given a context.
+
+    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=question-answering>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

     default_input_names = "question,context"
+    task = "question-answering"

     def __init__(
         self,
-        model,
-        tokenizer: Optional[PreTrainedTokenizer],
+        model: Optional = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         device: int = -1,
@@ -1003,23 +1241,77 @@ def pipeline(
     model: Optional = None,
     config: Optional[Union[str, PretrainedConfig]] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
-    modelcard: Optional[Union[str, ModelCard]] = None,
     framework: Optional[str] = None,
     **kwargs
 ) -> Pipeline:
     """
     Utility factory method to build a pipeline.

     Pipeline are made of:
-        A Tokenizer instance in charge of mapping raw textual input to token
-        A Model instance
-        Some (optional) post processing for enhancing model's output
+
+        - A Tokenizer instance in charge of mapping raw textual input to token
+        - A Model instance
+        - Some (optional) post processing for enhancing model's output
+
+    Args:
+        task (:obj:`str`):
+            The task defining which pipeline will be returned. Currently accepted tasks are:
+
+            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
+            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
+            - "ner": will return a :class:`~transformers.NerPipeline`
+            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
+            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
+            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained model configuration inheriting from
+            :class:`~transformers.PretrainedConfig`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+
+    Returns:
+        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
+        the task.

-    Examples:
+    Examples::
+
+        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+
+        # Sentiment analysis pipeline
         pipeline('sentiment-analysis')
+
+        # Question answering pipeline, specifying the checkpoint identifier
         pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
-        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
-        pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
-        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
+
+        # Named entity recognition pipeline, passing in a specific model and tokenizer
+        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+        pipeline('ner', model=model, tokenizer=tokenizer)
+
+        # Named entity recognition pipeline, passing a model and configuration with a HTTPS URL.
+        model_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/pytorch_model.bin"
+        config_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/config.json"
+        pipeline('ner', model=model_url, config=config_url, tokenizer='bert-base-cased')
     """
     # Retrieve the task
     if task not in SUPPORTED_TASKS:
@@ -1048,13 +1340,12 @@ def pipeline(
             "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
         )

+    modelcard = None
     # Try to infer modelcard from model or config name (if provided as str)
-    if modelcard is None:
-        # Try to fallback on one of the provided string for model or config (will replace the suffix)
-        if isinstance(model, str):
-            modelcard = model
-        elif isinstance(config, str):
-            modelcard = config
+    if isinstance(model, str):
+        modelcard = model
+    elif isinstance(config, str):
+        modelcard = config

     # Instantiate tokenizer if needed
     if isinstance(tokenizer, (str, tuple)):
...
@@ -19,6 +19,7 @@ import logging
 import os
 import unicodedata
 from shutil import copyfile
+from typing import List, Optional

 from .tokenization_utils import PreTrainedTokenizer
@@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁"
 class AlbertTokenizer(PreTrainedTokenizer):
     """
-    SentencePiece based tokenizer. Peculiarities:
-        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to keep accents when tokenizing.
+        bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The end of sequence token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         An ALBERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
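A short usage sketch of the method above (illustrative only; the `albert-base-v2` checkpoint and the input strings are assumptions):

    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How are you?"))

    single = tokenizer.build_inputs_with_special_tokens(ids_a)
    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    # single == [cls_id] + ids_a + [sep_id]
    # pair   == [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]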
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
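Continuing the AlbertTokenizer sketch shown earlier (illustrative only, with `ids_a` and `ids_b` being plain ID lists without special tokens), the mask flags exactly the positions that `build_inputs_with_special_tokens` adds:

    mask = tokenizer.get_special_tokens_mask(ids_a, ids_b)
    # == [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]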
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An ALBERT sequence pair mask has the following format:
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-            | first sequence | second sequence
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence | second sequence |
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
...
@@ -19,11 +19,7 @@ from .tokenization_roberta import RobertaTokenizer
 # vocab and merges same as roberta
 vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
 merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
-_all_bart_models = [
-    "bart-large",
-    "bart-large-mnli",
-    # "bart-large-cnn"
-]
+_all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"]


 class BartTokenizer(RobertaTokenizer):
...
@@ -19,6 +19,7 @@ import collections
 import logging
 import os
 import unicodedata
+from typing import List, Optional

 from tokenizers import BertWordPieceTokenizer
@@ -117,17 +118,41 @@ def whitespace_tokenize(text):
 class BertTokenizer(PreTrainedTokenizer):
     r"""
-    Constructs a BertTokenizer.
-    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+    Constructs a BERT tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.

     Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
+        vocab_file (:obj:`string`):
+            File containing the vocabulary.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to do basic tokenization before WordPiece.
+        never_split (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            List of tokens which will never be split during tokenization. Only has an effect when
+            :obj:`do_basic_tokenize=True`
+        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tokenize Chinese characters.
+            This should likely be deactivated for Japanese:
+            see: https://github.com/huggingface/transformers/issues/328
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer):
         tokenize_chinese_chars=True,
         **kwargs
     ):
-        """Constructs a BertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-            **do_basic_tokenize**: (`optional`) boolean (default True)
-                Whether to do basic tokenization before wordpiece.
-            **never_split**: (`optional`) list of string
-                List of tokens which will never be split during tokenization.
-                Only has an effect when do_basic_tokenize=True
-            **tokenize_chinese_chars**: (`optional`) boolean (default True)
-                Whether to tokenize Chinese characters.
-                This should likely be deactivated for Japanese:
-                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
-        """
         super().__init__(
             unk_token=unk_token,
             sep_token=sep_token,
@@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
         if already_has_special_tokens:
@@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
-            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-            | first sequence | second sequence
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence | second sequence |
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
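A short usage sketch (illustrative only; the checkpoint and sentences are assumptions): the segment mask pairs 0s with the first sequence and its [CLS]/[SEP] tokens, and 1s with the second sequence and its trailing [SEP].

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How old are you?"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I am six."))

    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
    # == [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)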
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            vocab_path (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         index = 0
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
...
@@ -18,6 +18,7 @@
 import logging
 import os
 from shutil import copyfile
+from typing import List, Optional

 import sentencepiece as spm
@@ -53,7 +54,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
     Adapted from RobertaTokenizer and XLNetTokenizer
     SentencePiece based tokenizer. Peculiarities:

         - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        A CamemBERT sequence has the following format:
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
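Note the double separator between the two segments, which differs from BERT-style pairs. A tiny sketch (illustrative only; the `camembert-base` checkpoint and input strings are assumptions):

    from transformers import CamembertTokenizer

    tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Bonjour"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("le monde"))

    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    # == [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]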
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
...@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
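Continuing the same hypothetical ID lists, the special tokens mask for the pair case marks each special-token position with 1 and each sequence token with 0::

    ids_a, ids_b = [10, 11, 12], [20, 21]
    mask = [1] + [0] * len(ids_a) + [1, 1] + [0] * len(ids_b) + [1]
    # -> [1, 0, 0, 0, 1, 1, 0, 0, 1], lining up with <s> A </s></s> B </s>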
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format: A CamemBERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence ::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
if token_ids_1 is None, only returns the first portion of the mask (0's). Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -116,8 +116,21 @@ def get_pairs(word): ...@@ -116,8 +116,21 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer): class CTRLTokenizer(PreTrainedTokenizer):
""" """
CTRL BPE tokenizer. Peculiarities: Constructs a CTRL tokenizer. Peculiarities:
- Byte-Pair-Encoding
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer): ...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = { ...@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizer(BertTokenizer): class DistilBertTokenizer(BertTokenizer):
r""" r"""
Constructs a DistilBertTokenizer. Constructs a DistilBertTokenizer.
:class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True parameters.
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
minimum of this value (if specified) and the underlying BERT model's sequence length.
never_split: List of tokens which will never be split during tokenization. Only has an effect when
do_basic_tokenize=True
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
......
...@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer): ...@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer):
""" """
BPE tokenizer for Flaubert BPE tokenizer for Flaubert
- Moses preprocessing & tokenization - Moses preprocessing & tokenization
- Normalize all inputs text
- Normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)
(ex: "__classify__") to a vocabulary
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
- `do_lowercase` controls lower casing (automatically set for pretrained vocabularies) and documentation regarding arguments.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
......
...@@ -101,11 +101,35 @@ def get_pairs(word): ...@@ -101,11 +101,35 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer): class GPT2Tokenizer(PreTrainedTokenizer):
""" """
GPT-2 BPE tokenizer. Peculiarities: GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding and tokenize methods should be called with the - Byte-level Byte-Pair-Encoding
``add_prefix_space`` flag set to ``True``. - Requires a space to start the input string => the encoding methods should be called with the
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve ``add_prefix_space`` flag set to ``True``.
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"` Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The end of sequence token.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer): ...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
return text return text
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -82,8 +82,21 @@ def text_standardize(text): ...@@ -82,8 +82,21 @@ def text_standardize(text):
class OpenAIGPTTokenizer(PreTrainedTokenizer): class OpenAIGPTTokenizer(PreTrainedTokenizer):
""" """
BPE tokenizer. Peculiarities: BPE tokenizer. Peculiarities:
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. - lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): ...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
import logging import logging
from typing import List, Optional
from tokenizers.processors import RobertaProcessing from tokenizers.processors import RobertaProcessing
...@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizer(GPT2Tokenizer): class RobertaTokenizer(GPT2Tokenizer):
""" """
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the - Byte-level Byte-Pair-Encoding
``add_prefix_space`` flag set to ``True``. - Requires a space to start the input string => the encoding methods should be called with the
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve ``add_prefix_space`` flag set to ``True``.
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
...@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's). Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
......
...@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin" ...@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
class TransfoXLTokenizer(PreTrainedTokenizer): class TransfoXLTokenizer(PreTrainedTokenizer):
""" """
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
raise ValueError("No <unkown> token in vocabulary") raise ValueError("No <unkown> token in vocabulary")
def save_vocabulary(self, vocab_path): def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file.""" """
Save the vocabulary and special tokens file to a directory.
Args:
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning( logger.warning(
"Please note you will not be able to load the save vocabulary in" "Please note you will not be able to load the save vocabulary in"
......
...@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object): ...@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object):
"https://github.com/huggingface/transformers/pull/2674" "https://github.com/huggingface/transformers/pull/2674"
) )
# Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
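For a tokenizer that ships without a padding token (GPT-2, for instance), a minimal sketch of the workaround suggested by this error::

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Either reuse an existing special token as the pad token...
    tokenizer.pad_token = tokenizer.eos_token
    # ...or register a brand new one (the "[PAD]" string is only an example):
    # tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    encoded = tokenizer.encode_plus("a short sentence", max_length=16, pad_to_max_length=True)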
first_ids = get_input_ids(text) first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None second_ids = get_input_ids(text_pair) if text_pair is not None else None
...@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object): ...@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object):
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
) )
# Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
if return_offsets_mapping: if return_offsets_mapping:
raise NotImplementedError( raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers." "return_offset_mapping is not available when using Python tokenizers."
...@@ -1126,8 +1138,7 @@ class PreTrainedTokenizer(object): ...@@ -1126,8 +1138,7 @@ class PreTrainedTokenizer(object):
input_ids = [] input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs: for ids_or_pair_ids in batch_text_or_text_pairs:
if isinstance(ids_or_pair_ids, (list, tuple)): if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
assert len(ids_or_pair_ids) == 2
ids, pair_ids = ids_or_pair_ids ids, pair_ids = ids_or_pair_ids
else: else:
ids, pair_ids = ids_or_pair_ids, None ids, pair_ids = ids_or_pair_ids, None
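With the relaxed check above, ``batch_encode_plus`` accepts either plain texts or ``(text, text_pair)`` 2-tuples; a hedged sketch of both input shapes, assuming ``tokenizer`` is any instantiated tokenizer::

    single_batch = ["first sentence", "second sentence"]
    pair_batch = [("a question", "its context"), ("another question", "more context")]

    encoded_singles = tokenizer.batch_encode_plus(single_batch)
    encoded_pairs = tokenizer.batch_encode_plus(pair_batch)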
...@@ -1789,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): ...@@ -1789,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
# Throw an error if we can pad because there is no padding token # Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None: if pad_to_max_length and self.pad_token_id is None:
raise ValueError("Unable to set proper padding strategy as the tokenizer does have padding token") raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token")
# Set the truncation and padding strategy and restore the initial configuration # Set the truncation and padding strategy and restore the initial configuration
with truncate_and_pad( with truncate_and_pad(
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
import re import re
import sys import sys
import unicodedata import unicodedata
from typing import List, Optional
import sacremoses as sm import sacremoses as sm
...@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer):
""" """
BPE tokenizer for XLM BPE tokenizer for XLM
- Moses preprocessing & tokenization for most supported languages - Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) - (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
- (optionally) lower case & normalize all inputs text (ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
(ex: "__classify__") to a vocabulary
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) should refer to the superclass for more information regarding methods.
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) Args:
vocab_file (:obj:`string`):
- `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies) Vocabulary file.
merges_file (:obj:`string`):
Merges file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "</s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
List of additional special tokens.
lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
Dictionary mapping languages string identifiers to their IDs.
id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase and remove accents when tokenizing.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace("</w>", " ").strip() out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A XLM sequence has the following format: A XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
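Unlike the RoBERTa/CamemBERT layout above, XLM uses a single separator between the two sequences; a minimal sketch with made-up IDs::

    cls_id, sep_id = 0, 1                 # hypothetical <s> and </s> IDs
    ids_a, ids_b = [10, 11], [20, 21]

    single = [cls_id] + ids_a + [sep_id]                     # <s> X </s>
    pair = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]    # <s> A </s> B </s>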
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLM sequence pair mask has the following format: An XLM sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's). ::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
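Continuing the made-up IDs from the previous sketch, the XLM token type IDs line up with that layout as follows::

    ids_a, ids_b = [10, 11], [20, 21]
    token_type_ids = (1 + len(ids_a) + 1) * [0] + (len(ids_b) + 1) * [1]
    # -> [0, 0, 0, 0, 1, 1, 1] for <s> A </s> B </s>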
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
import logging import logging
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional
from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils import PreTrainedTokenizer
...@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
Adapted from RobertaTokenizer and XLNetTokenizer Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities: SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_ - requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A XLM-R sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
raise ValueError( raise ValueError(
...@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. XLM-R does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -19,6 +19,7 @@ import logging ...@@ -19,6 +19,7 @@ import logging
import os import os
import unicodedata import unicodedata
from shutil import copyfile from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
...@@ -51,9 +52,57 @@ SEG_ID_PAD = 4 ...@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer): class XLNetTokenizer(PreTrainedTokenizer):
""" """
SentencePiece based tokenizer. Peculiarities: Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
- requires `SentencePiece <https://github.com/google/sentencepiece>`_ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
An XLNet sequence has the following format: An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls> - single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
return token_ids_0 + sep + cls return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls
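Unlike the tokenizers above, XLNet appends its classifier token at the end of the sequence; a minimal sketch with hypothetical IDs::

    sep_id, cls_id = 4, 3                 # hypothetical <sep> and <cls> IDs
    ids_a, ids_b = [10, 11], [20, 21]

    single = ids_a + [sep_id, cls_id]                         # X <sep> <cls>
    pair = ids_a + [sep_id] + ids_b + [sep_id, cls_id]        # A <sep> B <sep> <cls>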
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1] return ([0] * len(token_ids_0)) + [1, 1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format: An XLNet sequence pair mask has the following format:
...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
| first sequence | second sequence | CLS segment ID | first sequence | second sequence | CLS segment ID
if token_ids_1 is None, only returns the first portion of the mask (0's). if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls_segment_id = [2] cls_segment_id = [2]
...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
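Using the same hypothetical ID lists, the XLNet token type IDs end with the dedicated CLS segment ID (2)::

    ids_a, ids_b = [10, 11], [20, 21]
    token_type_ids = (len(ids_a) + 1) * [0] + (len(ids_b) + 1) * [1] + [2]
    # -> [0, 0, 0, 1, 1, 1, 2] for A <sep> B <sep> <cls>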
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -594,7 +594,7 @@ def main(): ...@@ -594,7 +594,7 @@ def main():
# Setup CUDA, GPU & distributed training # Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda: if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count() args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank) torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank) device = torch.device("cuda", args.local_rank)
......
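A hedged sketch of the corrected device setup above in context, assuming an ``args`` namespace with ``local_rank`` and ``no_cuda`` attributes as in the example scripts::

    import torch

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Report 0 GPUs when CUDA is explicitly disabled, even on a CUDA machine.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Distributed training: one process per GPU (rest of the setup unchanged).
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)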
...@@ -57,8 +57,18 @@ class ConfigTester(object): ...@@ -57,8 +57,18 @@ class ConfigTester(object):
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
def create_and_test_config_with_num_labels(self):
config = self.config_class(**self.inputs_dict, num_labels=5)
self.parent.assertEqual(len(config.id2label), 5)
self.parent.assertEqual(len(config.label2id), 5)
config.num_labels = 3
self.parent.assertEqual(len(config.id2label), 3)
self.parent.assertEqual(len(config.label2id), 3)
def run_common_tests(self): def run_common_tests(self):
self.create_and_test_config_common_properties() self.create_and_test_config_common_properties()
self.create_and_test_config_to_json_string() self.create_and_test_config_to_json_string()
self.create_and_test_config_to_json_file() self.create_and_test_config_to_json_file()
self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained()
self.create_and_test_config_with_num_labels()
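The new ``create_and_test_config_with_num_labels`` check exercises the fact that a config keeps ``id2label`` and ``label2id`` in sync with ``num_labels``; a hedged sketch of that behaviour with a concrete config class (``BertConfig`` is chosen here purely for illustration)::

    from transformers import BertConfig

    config = BertConfig(num_labels=5)
    assert len(config.id2label) == len(config.label2id) == 5

    config.num_labels = 3   # updating the property regenerates both mappings
    assert len(config.id2label) == len(config.label2id) == 3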
...@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase):
for file in files: for file in files:
# Open all files # Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f: with open(os.path.join(directory, file)) as f:
# Retrieve examples # Retrieve examples
examples = get_examples_from_file(f) examples = get_examples_from_file(f)
...@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase):
joined_examples.append(example) joined_examples.append(example)
joined_examples_index += 1 joined_examples_index += 1
print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples))) print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example. # Execute sub tests with every example.
for index, code_example in enumerate(joined_examples): for index, code_example in enumerate(joined_examples):
...@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase): ...@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase):
def test_main_doc_examples(self): def test_main_doc_examples(self):
doc_directory = "docs/source" doc_directory = "docs/source"
self.analyze_directory(doc_directory) ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
def test_modeling_examples(self): def test_modeling_examples(self):
transformers_directory = "src/transformers" transformers_directory = "src/transformers"
...@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase):
"modeling_tf_auto.py", "modeling_tf_auto.py",
"modeling_utils.py", "modeling_utils.py",
"modeling_tf_t5.py", "modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
] ]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files) self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)