Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
......@@ -90,11 +90,11 @@ class TokenClassificationTask:
sequence_a_segment_id=0,
mask_padding_with_zero=True,
) -> List[InputFeatures]:
""" Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""
# TODO clean up all this to leverage built-in features of tokenizers
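# Illustration (editorial sketch, not library code): the two CLS placements the
# docstring above describes, shown with plain token lists and segment ids.
tokens_a = ["hello", "world"]
# BERT/XLM pattern (cls_token_at_end=False): [CLS] + A + [SEP], CLS gets segment id 0
bert_tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
bert_segment_ids = [0] * len(bert_tokens)
# XLNet/GPT pattern (cls_token_at_end=True): A + [SEP] + [CLS], CLS gets segment id 2
xlnet_tokens = tokens_a + ["[SEP]", "[CLS]"]
xlnet_segment_ids = [0] * (len(tokens_a) + 1) + [2]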
......@@ -230,7 +230,8 @@ if is_torch_available():
):
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
data_dir,
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
)
# Make sure only the first process in distributed training processes the dataset,
......
......@@ -14,18 +14,18 @@ def swish(x):
def _gelu_python(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
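# Illustration (editorial sketch, not library code): the exact erf-based gelu above and
# the tanh approximation agree to within roughly 1e-3; a quick check, assuming torch is installed.
import math
import torch
x = torch.linspace(-3.0, 3.0, steps=7)
exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
approx = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
print(torch.max(torch.abs(exact - approx)).item())  # small, on the order of 1e-3 or below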
......
......@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
# run an additional 5 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
func,
repeat=1,
number=5,
)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
......
......@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
@dataclass
class TensorFlowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
default=None,
metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
default=0,
metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager mode."})
use_xla: bool = field(
......
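For context, the `field(default=..., metadata={"help": ...})` pattern in the hunk above is plain `dataclasses` machinery; the metadata dict is what an argument parser (such as transformers' `HfArgumentParser`) reads to build help text. A minimal sketch (class name is illustrative):

from dataclasses import dataclass, field

@dataclass
class ToyBenchmarkArguments:
    tpu_name: str = field(default=None, metadata={"help": "Name of TPU"})
    device_idx: int = field(default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."})

args = ToyBenchmarkArguments()
print(args.device_idx)  # 0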
......@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
timeit.repeat(func, repeat=1, number=5)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
return min(runtimes) / 10.0
except ResourceExhaustedError as e:
......
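The timing pattern in both benchmark hunks above follows the timeit documentation they link to: `timeit.repeat` returns one total per repeat, each covering `number` calls, so the per-call estimate is `min(runtimes) / number`. A minimal sketch of that calculation:

import timeit

def func():
    sum(range(1000))

runtimes = timeit.repeat(func, repeat=3, number=10)  # three totals, each over 10 calls
print(min(runtimes) / 10.0)  # best-case seconds per call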
......@@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(
def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
"""
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements, it is important that the function
is executed in a separate process.
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run the function in a separate process or not
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements, it is important that the function
is executed in a separate process.
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run the function in a separate process or not
"""
def multi_process_func(*args, **kwargs):
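# Illustration (editorial sketch, not library code): the general pattern the wrapper above
# relies on -- run a callable in its own Process and send the result back over a Pipe so
# its allocations never count against the parent process. Names are illustrative; the
# callable must be picklable under the spawn start method.
from multiprocessing import Pipe, Process

def _child(conn, fn):
    conn.send(fn())
    conn.close()

def run_isolated(fn):
    parent_conn, child_conn = Pipe()
    worker = Process(target=_child, args=(child_conn, fn))
    worker.start()
    result = parent_conn.recv()
    worker.join()
    return result
# usage: run_isolated(some_zero_argument_function)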
......@@ -106,13 +106,13 @@ def is_memory_tracing_enabled():
class Frame(NamedTuple):
""" `Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""`Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
filename: str
......@@ -123,10 +123,10 @@ class Frame(NamedTuple):
class UsedMemoryState(NamedTuple):
""" `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""`UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""
frame: Frame
......@@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):
class Memory(NamedTuple):
""" `Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""`Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""
bytes: int
......@@ -147,11 +147,11 @@ class Memory(NamedTuple):
class MemoryState(NamedTuple):
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""
frame: Frame
......@@ -161,14 +161,14 @@ class MemoryState(NamedTuple):
class MemorySummary(NamedTuple):
""" `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""`MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""
sequential: List[MemoryState]
......@@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]
def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
"""
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
Args:
- `function`: (`callable`): function() -> ...
function without any arguments for which to measure the peak memory
Args:
- `function`: (`callable`): function() -> ...
function without any arguments for which to measure the peak memory
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in seconds for which to measure the memory usage
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in seconds for which to measure the memory usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
Returns:
- `max_memory`: (`int`)
consumed memory peak in bytes
Returns:
- `max_memory`: (`int`)
consumed memory peak in bytes
"""
def get_cpu_memory(process_id: int) -> int:
"""
measures current cpu memory usage of a given `process_id`
measures current cpu memory usage of a given `process_id`
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Returns
- `memory`: (`int`)
consumed memory in bytes
Returns
- `memory`: (`int`)
consumed memory in bytes
"""
process = psutil.Process(process_id)
try:
......@@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
class MemoryMeasureProcess(Process):
"""
`MemoryMeasureProcess` inherits from `Process` and overrides
its `run()` method. Used to measure the memory usage of a process
`MemoryMeasureProcess` inherits from `Process` and overrides
its `run()` method. Used to measure the memory usage of a process
"""
def __init__(self, process_id: int, child_connection: Connection, interval: float):
......@@ -309,37 +309,37 @@ def start_memory_tracing(
events_to_trace: str = "line",
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
defaults to `line`
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Defaults to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
defaults to `line`
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Defaults to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
if is_psutil_available():
......@@ -371,8 +371,8 @@ def start_memory_tracing(
memory_trace = []
def traceit(frame, event, args):
""" Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""
global _is_memory_tracing_enabled
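# Illustration (editorial sketch, not library code): the bare sys.settrace pattern that
# traceit follows -- a trace function invoked on each event that records line hits and
# returns itself so local "line" events keep firing.
import sys

line_events = []

def tracer(frame, event, arg):
    if event == "line":
        line_events.append((frame.f_code.co_filename, frame.f_lineno))
    return tracer

def work():
    total = 0
    for i in range(3):
        total += i
    return total

sys.settrace(tracer)
work()
sys.settrace(None)
print(len(line_events))  # a handful of line events recorded inside work()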
......@@ -456,39 +456,39 @@ def start_memory_tracing(
def stop_memory_tracing(
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: True): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple has fields
- `bytes` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` is a namedtuple used to list the current frame state and has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: True): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple has fields
- `bytes` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` is a namedtuple used to list the current frame state and has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = False
......@@ -499,15 +499,19 @@ def stop_memory_tracing(
cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
memory_trace[:-1], memory_trace[1:]
):
for (
(frame, cpu_mem, gpu_mem),
(next_frame, next_cpu_mem, next_gpu_mem),
) in zip(memory_trace[:-1], memory_trace[1:]):
cpu_mem_inc = next_cpu_mem - cpu_mem
gpu_mem_inc = next_gpu_mem - gpu_mem
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
memory_diff_trace.append(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
)
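# Illustration (editorial sketch, not library code): the pairwise pattern used above,
# zipping the trace with itself shifted by one element to get per-line increments.
readings = [10, 12, 11, 15]  # e.g. successive memory readings
increments = [after - before for before, after in zip(readings[:-1], readings[1:])]
print(increments)  # [2, -1, 4]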
......@@ -529,7 +533,10 @@ def stop_memory_tracing(
) # order by the total CPU + GPU memory increase
cumulative_memory = list(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
)
......@@ -544,15 +551,17 @@ def stop_memory_tracing(
total_memory = Memory(total_memory)
return MemorySummary(
sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
sequential=memory_diff_trace,
cumulative=cumulative_memory,
current=memory_curr_trace,
total=total_memory,
)
return None
def bytes_to_mega_bytes(memory_amount: int) -> int:
""" Utility to convert a number of bytes (int) into a number of mega bytes (int)
"""
"""Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
return memory_amount >> 20
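# Illustration (editorial sketch, not library code): a right shift by 20 divides by 2**20,
# so the conversion truncates toward zero.
print((5 * 1024 * 1024 + 123) >> 20)  # 5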
......
......@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for attached classifiers.
Example::
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
... hidden_size=768,
... num_attention_heads=12,
... intermediate_size=3072,
... )
>>> # Initializing a model from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for attached classifiers.
Example::
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
... hidden_size=768,
... num_attention_heads=12,
... intermediate_size=3072,
... )
>>> # Initializing a model from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "albert"
......
......@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CONFIG_MAPPING = OrderedDict(
[
("retribert", RetriBertConfig,),
("t5", T5Config,),
("mobilebert", MobileBertConfig,),
("distilbert", DistilBertConfig,),
("albert", AlbertConfig,),
("camembert", CamembertConfig,),
("xlm-roberta", XLMRobertaConfig,),
(
"retribert",
RetriBertConfig,
),
(
"t5",
T5Config,
),
(
"mobilebert",
MobileBertConfig,
),
(
"distilbert",
DistilBertConfig,
),
(
"albert",
AlbertConfig,
),
(
"camembert",
CamembertConfig,
),
(
"xlm-roberta",
XLMRobertaConfig,
),
("pegasus", PegasusConfig),
("marian", MarianConfig,),
("mbart", MBartConfig,),
("bart", BartConfig,),
("reformer", ReformerConfig,),
("longformer", LongformerConfig,),
("roberta", RobertaConfig,),
("flaubert", FlaubertConfig,),
("bert", BertConfig,),
("openai-gpt", OpenAIGPTConfig,),
("gpt2", GPT2Config,),
("transfo-xl", TransfoXLConfig,),
("xlnet", XLNetConfig,),
("xlm", XLMConfig,),
("ctrl", CTRLConfig,),
("electra", ElectraConfig,),
("encoder-decoder", EncoderDecoderConfig,),
(
"marian",
MarianConfig,
),
(
"mbart",
MBartConfig,
),
(
"bart",
BartConfig,
),
(
"reformer",
ReformerConfig,
),
(
"longformer",
LongformerConfig,
),
(
"roberta",
RobertaConfig,
),
(
"flaubert",
FlaubertConfig,
),
(
"bert",
BertConfig,
),
(
"openai-gpt",
OpenAIGPTConfig,
),
(
"gpt2",
GPT2Config,
),
(
"transfo-xl",
TransfoXLConfig,
),
(
"xlnet",
XLNetConfig,
),
(
"xlm",
XLMConfig,
),
(
"ctrl",
CTRLConfig,
),
(
"electra",
ElectraConfig,
),
(
"encoder-decoder",
EncoderDecoderConfig,
),
]
)
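The ordering of the mapping above matters for the fallback pattern matching that `AutoConfig` performs on model identifiers: more specific keys such as "xlm-roberta" must be tried before "roberta". A minimal sketch of that kind of lookup (the toy mapping and function name are illustrative, not the library's own code):

from collections import OrderedDict

TOY_MAPPING = OrderedDict(
    [("xlm-roberta", "XLMRobertaConfig"), ("roberta", "RobertaConfig"), ("bert", "BertConfig")]
)

def guess_config_name(model_identifier):
    for pattern, config_name in TOY_MAPPING.items():
        if pattern in model_identifier:
            return config_name
    raise ValueError("Unrecognized model identifier: {}".format(model_identifier))

print(guess_config_name("xlm-roberta-base"))  # XLMRobertaConfig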
class AutoConfig:
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
"""
def __init__(self):
......
......@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class BartConfig(PretrainedConfig):
r"""
Configuration class for Bart. Parameters are renamed from the fairseq implementation
Configuration class for Bart. Parameters are renamed from the fairseq implementation
"""
model_type = "bart"
......@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
**common_kwargs
):
r"""
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
Examples::
Examples::
>>> from transformers import BartConfig, BartModel
>>> from transformers import BartConfig, BartModel
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
"""
if "hidden_size" in common_kwargs:
......
......@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "bert"
......
......@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "ctrl"
......
......@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
>>> from transformers import DistilBertModel, DistilBertConfig
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
>>> from transformers import DistilBertModel, DistilBertConfig
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "distilbert"
......
......@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(BertConfig):
r"""
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRModel`.
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRModel`.
This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model.
This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
"""
model_type = "dpr"
......
......@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of the encoder layers and the pooler layer.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation
Example::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA google/electra-small-discriminator style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the google/electra-small-discriminator style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of the embedding layer.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation
Example::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA google/electra-small-discriminator style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the google/electra-small-discriminator style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "electra"
......
......@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig):
r"""
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information.
Args:
kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config.
Args:
kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config.
Example::
Example::
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
"""
model_type = "encoder_decoder"
......
......@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig):
"""
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU activation instead of ReLU as the non-linear activation
function in the encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU activation instead of ReLU as the non-linear activation
function in the encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
"""
model_type = "flaubert"
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig.
"""
"""Constructs FlaubertConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop
self.pre_norm = pre_norm
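A minimal sketch of the two Flaubert-specific arguments handled by this constructor (the values are illustrative, not those of a released FlauBERT checkpoint; every other setting keeps its XLM-style default):

from transformers import FlaubertConfig

# Enable pre-layer-norm and a 10% LayerDrop probability.
configuration = FlaubertConfig(layerdrop=0.1, pre_norm=True)
print(configuration.layerdrop, configuration.pre_norm)  # 0.1 True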
......@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"
......
......@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Example::
Example::
>>> from transformers import LongformerConfig, LongformerModel
>>> from transformers import LongformerConfig, LongformerModel
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "longformer"
......
......@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use a bottleneck layer in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for the query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use a bottleneck layer in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for the query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert"
......
......@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "openai-gpt"
......