Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
@@ -90,11 +90,11 @@ class TokenClassificationTask:
         sequence_a_segment_id=0,
         mask_padding_with_zero=True,
     ) -> List[InputFeatures]:
-        """ Loads a data file into a list of `InputFeatures`
+        """Loads a data file into a list of `InputFeatures`
         `cls_token_at_end` define the location of the CLS token:
             - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
             - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
         `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
         """
         # TODO clean up all this to leverage built-in features of tokenizers
@@ -230,7 +230,8 @@ if is_torch_available():
         ):
             # Load data features from cache or dataset file
             cached_features_file = os.path.join(
-                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
+                data_dir,
+                "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
             )
             # Make sure only the first process in distributed training processes the dataset,
...
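For context, the re-wrapped call above does not change the cache-file name that gets produced. A minimal sketch with hypothetical values (a "train" split, a BERT tokenizer, max length 128 — none of these are taken from the commit) shows the pattern:

# Hypothetical illustration of the cache-file naming scheme above.
mode_value = "train"              # e.g. Split.train.value
tokenizer_name = "BertTokenizer"  # tokenizer.__class__.__name__
max_seq_length = 128

cached_features_file = "cached_{}_{}_{}".format(mode_value, tokenizer_name, str(max_seq_length))
print(cached_features_file)  # -> cached_train_BertTokenizer_128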
@@ -14,18 +14,18 @@ def swish(x):
 def _gelu_python(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
+    """Original Implementation of the gelu activation function in Google Bert repo when initially created.
     For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
     0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
     This is now written in C in torch.nn.functional
     Also see https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


 def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+    """Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
     Also see https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
...
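As a side note, the two variants touched in this hunk differ only by the tanh approximation of gelu. A minimal, self-contained sketch (assuming only PyTorch is installed; the function names here are my own, not the library's) comparing the exact erf form against the tanh approximation:

import math

import torch


def gelu_erf(x):
    # exact form, same expression as _gelu_python above
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_tanh(x):
    # tanh approximation, same expression as gelu_new above
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


x = torch.linspace(-3.0, 3.0, steps=101)
print(torch.max(torch.abs(gelu_erf(x) - gelu_tanh(x))))  # small, well below 1e-2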
@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
                 # run additional 10 times to stabilize compilation for tpu and torchscript
                 logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
                 timeit.repeat(
-                    func, repeat=1, number=5,
+                    func,
+                    repeat=1,
+                    number=5,
                 )

             # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+            runtimes = timeit.repeat(
+                func,
+                repeat=self.args.repeat,
+                number=10,
+            )

             if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
                 import torch_xla.debug.metrics as met
...
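The `timeit.repeat(...)` call that Black re-wraps here follows the usual "take the minimum of several repeats" benchmarking pattern. A standalone sketch of that pattern (the benchmarked function is a placeholder, not the model call from the diff):

import timeit


def func():
    # placeholder workload standing in for the model's inference call
    sum(i * i for i in range(10_000))


# repeat/number mirror the keyword arguments above; min() is preferred over the
# mean because slower runs mostly measure interference from the rest of the system.
runtimes = timeit.repeat(func, repeat=3, number=10)
print(min(runtimes) / 10)  # average time per call within the fastest repeat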
@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
 @dataclass
 class TensorFlowBenchmarkArguments(BenchmarkArguments):
     tpu_name: str = field(
-        default=None, metadata={"help": "Name of TPU"},
+        default=None,
+        metadata={"help": "Name of TPU"},
     )
     device_idx: int = field(
-        default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
+        default=0,
+        metadata={"help": "CPU / GPU device index. Defaults to 0."},
     )
     eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
     use_xla: bool = field(
...
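The `field(default=..., metadata={"help": ...})` pattern being re-wrapped above is how these benchmark arguments are declared as dataclass fields. A minimal sketch of the idea using only the standard library (the class name is illustrative, not the real one from the diff):

from dataclasses import dataclass, field, fields


@dataclass
class ExampleArguments:
    tpu_name: str = field(
        default=None,
        metadata={"help": "Name of TPU"},
    )
    device_idx: int = field(
        default=0,
        metadata={"help": "CPU / GPU device index. Defaults to 0."},
    )


# The metadata dict is free-form; dataclass-driven argument parsers
# (e.g. transformers' HfArgumentParser) read the "help" key for --help text.
for f in fields(ExampleArguments):
    print(f.name, f.metadata["help"])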
@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
                 timeit.repeat(func, repeat=1, number=5)

             # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+            runtimes = timeit.repeat(
+                func,
+                repeat=self.args.repeat,
+                number=10,
+            )

             return min(runtimes) / 10.0
         except ResourceExhaustedError as e:
...
@@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(
 def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
     """
     This function wraps another function into its own separated process.
     In order to ensure accurate memory measurements it is important that the function
     is executed in a separate process

     Args:
         - `func`: (`callable`): function() -> ...
             generic function which will be executed in its own separate process
         - `do_multi_processing`: (`bool`)
             Whether to run function on separate process or not
     """

     def multi_process_func(*args, **kwargs):
@@ -106,13 +106,13 @@ def is_memory_tracing_enabled():
 class Frame(NamedTuple):
-    """ `Frame` is a NamedTuple used to gather the current frame state.
+    """`Frame` is a NamedTuple used to gather the current frame state.
     `Frame` has the following fields:
         - 'filename' (string): Name of the file currently executed
         - 'module' (string): Name of the module currently executed
         - 'line_number' (int): Number of the line currently executed
         - 'event' (string): Event that triggered the tracing (default will be "line")
         - 'line_text' (string): Text of the line in the python script
     """

     filename: str
@@ -123,10 +123,10 @@ class Frame(NamedTuple):
 class UsedMemoryState(NamedTuple):
-    """ `UsedMemoryState` are named tuples with the following fields:
+    """`UsedMemoryState` are named tuples with the following fields:
         - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
         - 'cpu_memory': CPU RSS memory state *before* executing the line
         - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
     """

     frame: Frame
@@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):
 class Memory(NamedTuple):
-    """ `Memory` NamedTuple have a single field `bytes` and
+    """`Memory` NamedTuple have a single field `bytes` and
     you can get a human readable str of the number of mega bytes by calling `__repr__`
         - `byte` (integer): number of bytes,
     """

     bytes: int
@@ -147,11 +147,11 @@ class Memory(NamedTuple):
 class MemoryState(NamedTuple):
-    """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
+    """`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
         - `frame` (`Frame`): the current frame (see above)
         - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
         - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
         - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
     """

     frame: Frame
@@ -161,14 +161,14 @@ class MemoryState(NamedTuple):
 class MemorySummary(NamedTuple):
-    """ `MemorySummary` namedtuple otherwise with the fields:
+    """`MemorySummary` namedtuple otherwise with the fields:
         - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
             by substracting the memory after executing each line from the memory before executing said line.
         - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
             obtained by summing repeated memory increase for a line if it's executed several times.
             The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
         - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
             Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
     """

     sequential: List[MemoryState]
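Taken together, these NamedTuples nest into one another: a `MemoryState` wraps a `Frame` plus three `Memory` values. A small sketch with made-up byte counts, mirroring the definitions in the hunks above (the classes are re-declared here only so the snippet runs on its own):

from typing import NamedTuple


class Frame(NamedTuple):
    filename: str
    module: str
    line_number: int
    event: str
    line_text: str


class Memory(NamedTuple):
    bytes: int


class MemoryState(NamedTuple):
    frame: Frame
    cpu: Memory
    gpu: Memory
    cpu_gpu: Memory


# Hypothetical values purely for illustration.
frame = Frame("modeling_bert.py", "transformers.modeling_bert", 42, "line", "x = self.dense(x)")
state = MemoryState(frame=frame, cpu=Memory(1024), gpu=Memory(2048), cpu_gpu=Memory(3072))
print(state.cpu_gpu.bytes)  # 3072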
@@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]
 def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
     """
     measures peak cpu memory consumption of a given `function`
     running the function for at least interval seconds
     and at most 20 * interval seconds.
     This function is heavily inspired by: `memory_usage`
     of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
     Args:
         - `function`: (`callable`): function() -> ...
             function without any arguments to measure for which to measure the peak memory
         - `interval`: (`float`, `optional`, defaults to `0.5`)
             interval in second for which to measure the memory usage
         - `device_idx`: (`int`, `optional`, defaults to `None`)
             device id for which to measure gpu usage
     Returns:
         - `max_memory`: (`int`)
             cosumed memory peak in Bytes
     """

     def get_cpu_memory(process_id: int) -> int:
         """
         measures current cpu memory usage of a given `process_id`
         Args:
             - `process_id`: (`int`)
                 process_id for which to measure memory
         Returns
             - `memory`: (`int`)
                 cosumed memory in Bytes
         """
         process = psutil.Process(process_id)
         try:
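For readers unfamiliar with the helper shown in the hunk above, a hedged usage sketch: it assumes `psutil` is installed and that `measure_peak_memory_cpu` is importable from `transformers.benchmark.benchmark_utils` (the module this commit touches); the workload function is a placeholder.

from transformers.benchmark.benchmark_utils import measure_peak_memory_cpu


def allocate():
    # placeholder workload: build and drop a large list of integers
    data = list(range(10_000_000))
    return len(data)


peak_bytes = measure_peak_memory_cpu(allocate, interval=0.5)
print(peak_bytes)  # peak RSS in bytes observed while `allocate` ran, per the docstring above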
@@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
     class MemoryMeasureProcess(Process):
         """
         `MemoryMeasureProcess` inherits from `Process` and overwrites
         its `run()` method. Used to measure the memory usage of a process
         """

         def __init__(self, process_id: int, child_connection: Connection, interval: float):
@@ -309,37 +309,37 @@ def start_memory_tracing(
     events_to_trace: str = "line",
     gpus_to_trace: Optional[List[int]] = None,
 ) -> MemoryTrace:
-    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+    """Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
     See `./benchmark.py` for usage examples.
     Current memory consumption is returned using psutil and in particular is the RSS memory
         "Resident Set Size” (the non-swapped physical memory the process is using).
         See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
     Args:
         - `modules_to_trace`: (None, string, list/tuple of string)
             if None, all events are recorded
             if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
         - `modules_not_to_trace`: (None, string, list/tuple of string)
             if None, no module is avoided
             if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
         - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
             default to line
         - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
     Return:
         - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
             - `UsedMemoryState` are named tuples with the following fields:
                 - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
                 - 'cpu_memory': CPU RSS memory state *before* executing the line
                 - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
     `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
         `Frame` has the following fields:
             - 'filename' (string): Name of the file currently executed
             - 'module' (string): Name of the module currently executed
             - 'line_number' (int): Number of the line currently executed
             - 'event' (string): Event that triggered the tracing (default will be "line")
             - 'line_text' (string): Text of the line in the python script
     """
     if is_psutil_available():
@@ -371,8 +371,8 @@ def start_memory_tracing(
     memory_trace = []

     def traceit(frame, event, args):
-        """ Tracing method executed before running each line in a module or sub-module
+        """Tracing method executed before running each line in a module or sub-module
         Record memory allocated in a list with debugging information
         """
         global _is_memory_tracing_enabled
@@ -456,39 +456,39 @@ def start_memory_tracing(
 def stop_memory_tracing(
     memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
 ) -> Optional[MemorySummary]:
-    """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
+    """Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
     Args:
         - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
         - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
     Return:
         - None if `memory_trace` is None
         - `MemorySummary` namedtuple otherwise with the fields:
             - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
                 by substracting the memory after executing each line from the memory before executing said line.
             - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
                 obtained by summing repeated memory increase for a line if it's executed several times.
                 The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
             - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
                 Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
     `Memory` named tuple have fields
         - `byte` (integer): number of bytes,
         - `string` (string): same as human readable string (ex: "3.5MB")
     `Frame` are namedtuple used to list the current frame state and have the following fields:
         - 'filename' (string): Name of the file currently executed
         - 'module' (string): Name of the module currently executed
         - 'line_number' (int): Number of the line currently executed
         - 'event' (string): Event that triggered the tracing (default will be "line")
         - 'line_text' (string): Text of the line in the python script
     `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
         - `frame` (`Frame`): the current frame (see above)
         - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
         - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
         - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
     """
     global _is_memory_tracing_enabled
     _is_memory_tracing_enabled = False
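The two docstrings above describe a paired API. A brief usage sketch, hedged: it assumes both functions are importable from `transformers.benchmark.benchmark_utils`, the module name passed to `modules_to_trace` is an example, and the code to profile is elided.

from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

# Start line-by-line RSS tracing, restricted to the transformers package.
trace = start_memory_tracing("transformers")

# ... run the code to profile here, e.g. a model forward pass ...

summary = stop_memory_tracing(trace)
if summary is not None:
    print(summary.total)                  # total memory increase, as a Memory namedtuple
    for state in summary.cumulative[:5]:  # the five most memory-hungry lines
        print(state.frame.filename, state.frame.line_number, state.cpu_gpu)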
@@ -499,15 +499,19 @@ def stop_memory_tracing(
         cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])

-        for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
-            memory_trace[:-1], memory_trace[1:]
-        ):
+        for (
+            (frame, cpu_mem, gpu_mem),
+            (next_frame, next_cpu_mem, next_gpu_mem),
+        ) in zip(memory_trace[:-1], memory_trace[1:]):
             cpu_mem_inc = next_cpu_mem - cpu_mem
             gpu_mem_inc = next_gpu_mem - gpu_mem
             cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
             memory_diff_trace.append(
                 MemoryState(
-                    frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+                    frame=frame,
+                    cpu=Memory(cpu_mem_inc),
+                    gpu=Memory(gpu_mem_inc),
+                    cpu_gpu=Memory(cpu_gpu_mem_inc),
                 )
             )
@@ -529,7 +533,10 @@ def stop_memory_tracing(
         )  # order by the total CPU + GPU memory increase
         cumulative_memory = list(
             MemoryState(
-                frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+                frame=frame,
+                cpu=Memory(cpu_mem_inc),
+                gpu=Memory(gpu_mem_inc),
+                cpu_gpu=Memory(cpu_gpu_mem_inc),
             )
             for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
@@ -544,15 +551,17 @@ def stop_memory_tracing(
         total_memory = Memory(total_memory)

         return MemorySummary(
-            sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
+            sequential=memory_diff_trace,
+            cumulative=cumulative_memory,
+            current=memory_curr_trace,
+            total=total_memory,
         )

     return None


 def bytes_to_mega_bytes(memory_amount: int) -> int:
-    """ Utility to convert a number of bytes (int) into a number of mega bytes (int)
-    """
+    """Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
     return memory_amount >> 20
...
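The right shift by 20 in `bytes_to_mega_bytes` is an integer division by 2**20 (one mebibyte). A one-line check with an arbitrary example value:

memory_amount = 3 * 2**20 + 123  # a little over 3 MiB
print(memory_amount >> 20)       # 3 -- same result as memory_amount // 2**20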
@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class AlbertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
     It is used to instantiate an ALBERT model according to the specified arguments, defining the model
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
     the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
     to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
     for more information.

     Args:
         vocab_size (:obj:`int`, optional, defaults to 30000):
             Vocabulary size of the ALBERT model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
         embedding_size (:obj:`int`, optional, defaults to 128):
             Dimensionality of vocabulary embeddings.
         hidden_size (:obj:`int`, optional, defaults to 4096):
             Dimensionality of the encoder layers and the pooler layer.
         num_hidden_layers (:obj:`int`, optional, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_hidden_groups (:obj:`int`, optional, defaults to 1):
             Number of groups for the hidden layers, parameters in the same group are shared.
         num_attention_heads (:obj:`int`, optional, defaults to 64):
             Number of attention heads for each attention layer in the Transformer encoder.
         intermediate_size (:obj:`int`, optional, defaults to 16384):
             The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         inner_group_num (:obj:`int`, optional, defaults to 1):
             The number of inner repetition of attention and ffn.
         hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
             The non-linear activation function (function or string) in the encoder and pooler.
             If string, "gelu", "relu", "swish" and "gelu_new" are supported.
         hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, optional, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something
             large (e.g., 512 or 1024 or 2048).
         type_vocab_size (:obj:`int`, optional, defaults to 2):
             The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
         initializer_range (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
             The dropout ratio for attached classifiers.

     Example::

         >>> from transformers import AlbertConfig, AlbertModel
         >>> # Initializing an ALBERT-xxlarge style configuration
         >>> albert_xxlarge_configuration = AlbertConfig()
         >>> # Initializing an ALBERT-base style configuration
         >>> albert_base_configuration = AlbertConfig(
         ...     hidden_size=768,
         ...     num_attention_heads=12,
         ...     intermediate_size=3072,
         ... )
         >>> # Initializing a model from the ALBERT-base style configuration
         >>> model = AlbertModel(albert_xxlarge_configuration)
         >>> # Accessing the model configuration
         >>> configuration = model.config
     """

     model_type = "albert"
...
@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
 CONFIG_MAPPING = OrderedDict(
     [
-        ("retribert", RetriBertConfig,),
-        ("t5", T5Config,),
-        ("mobilebert", MobileBertConfig,),
-        ("distilbert", DistilBertConfig,),
-        ("albert", AlbertConfig,),
-        ("camembert", CamembertConfig,),
-        ("xlm-roberta", XLMRobertaConfig,),
+        (
+            "retribert",
+            RetriBertConfig,
+        ),
+        (
+            "t5",
+            T5Config,
+        ),
+        (
+            "mobilebert",
+            MobileBertConfig,
+        ),
+        (
+            "distilbert",
+            DistilBertConfig,
+        ),
+        (
+            "albert",
+            AlbertConfig,
+        ),
+        (
+            "camembert",
+            CamembertConfig,
+        ),
+        (
+            "xlm-roberta",
+            XLMRobertaConfig,
+        ),
         ("pegasus", PegasusConfig),
-        ("marian", MarianConfig,),
-        ("mbart", MBartConfig,),
-        ("bart", BartConfig,),
-        ("reformer", ReformerConfig,),
-        ("longformer", LongformerConfig,),
-        ("roberta", RobertaConfig,),
-        ("flaubert", FlaubertConfig,),
-        ("bert", BertConfig,),
-        ("openai-gpt", OpenAIGPTConfig,),
-        ("gpt2", GPT2Config,),
-        ("transfo-xl", TransfoXLConfig,),
-        ("xlnet", XLNetConfig,),
-        ("xlm", XLMConfig,),
-        ("ctrl", CTRLConfig,),
-        ("electra", ElectraConfig,),
-        ("encoder-decoder", EncoderDecoderConfig,),
+        (
+            "marian",
+            MarianConfig,
+        ),
+        (
+            "mbart",
+            MBartConfig,
+        ),
+        (
+            "bart",
+            BartConfig,
+        ),
+        (
+            "reformer",
+            ReformerConfig,
+        ),
+        (
+            "longformer",
+            LongformerConfig,
+        ),
+        (
+            "roberta",
+            RobertaConfig,
+        ),
+        (
+            "flaubert",
+            FlaubertConfig,
+        ),
+        (
+            "bert",
+            BertConfig,
+        ),
+        (
+            "openai-gpt",
+            OpenAIGPTConfig,
+        ),
+        (
+            "gpt2",
+            GPT2Config,
+        ),
+        (
+            "transfo-xl",
+            TransfoXLConfig,
+        ),
+        (
+            "xlnet",
+            XLNetConfig,
+        ),
+        (
+            "xlm",
+            XLMConfig,
+        ),
+        (
+            "ctrl",
+            CTRLConfig,
+        ),
+        (
+            "electra",
+            ElectraConfig,
+        ),
+        (
+            "encoder-decoder",
+            EncoderDecoderConfig,
+        ),
     ]
 )


 class AutoConfig:
     r"""
     :class:`~transformers.AutoConfig` is a generic configuration class
     that will be instantiated as one of the configuration classes of the library
     when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.

     The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
     based on the `model_type` property of the config object, or when it's missing,
     falling back to using pattern matching on the `pretrained_model_name_or_path` string.
     """

     def __init__(self):
...
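The expanded `CONFIG_MAPPING` above is what `AutoConfig.from_pretrained` consults, together with the `model_type` field of a downloaded config. A short usage sketch, assuming network access to the model hub:

from transformers import AutoConfig

# Resolves to a BertConfig because the checkpoint's config declares model_type == "bert".
config = AutoConfig.from_pretrained("bert-base-uncased")
print(type(config).__name__)  # BertConfig
print(config.num_hidden_layers)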
@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
 @add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
 class BartConfig(PretrainedConfig):
     r"""
     Configuration class for Bart. Parameters are renamed from the fairseq implementation
     """

     model_type = "bart"
@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
         **common_kwargs
     ):
         r"""
         :class:`~transformers.BartConfig` is the configuration class for `BartModel`.

         Examples::

             >>> from transformers import BartConfig, BartModel
             >>> config = BartConfig.from_pretrained('facebook/bart-large')
             >>> model = BartModel(config)
         """
         if "hidden_size" in common_kwargs:
...
@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class BertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
     It is used to instantiate an BERT model according to the specified arguments, defining the model
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
     the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
     to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
     for more information.

     Args:
         vocab_size (:obj:`int`, optional, defaults to 30522):
             Vocabulary size of the BERT model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
         hidden_size (:obj:`int`, optional, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
         num_hidden_layers (:obj:`int`, optional, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (:obj:`int`, optional, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
         intermediate_size (:obj:`int`, optional, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
             The non-linear activation function (function or string) in the encoder and pooler.
             If string, "gelu", "relu", "swish" and "gelu_new" are supported.
         hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, optional, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (:obj:`int`, optional, defaults to 2):
             The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
         initializer_range (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         gradient_checkpointing (:obj:`bool`, optional, defaults to False):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.

     Example::

         >>> from transformers import BertModel, BertConfig
         >>> # Initializing a BERT bert-base-uncased style configuration
         >>> configuration = BertConfig()
         >>> # Initializing a model from the bert-base-uncased style configuration
         >>> model = BertModel(configuration)
         >>> # Accessing the model configuration
         >>> configuration = model.config
     """

     model_type = "bert"
...
@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
 class CTRLConfig(PretrainedConfig):
     """
     This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
     It is used to instantiate an CTRL model according to the specified arguments, defining the model
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
     the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
     to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
     for more information.

     Args:
         vocab_size (:obj:`int`, optional, defaults to 246534):
             Vocabulary size of the CTRL model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
         n_positions (:obj:`int`, optional, defaults to 256):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
         n_ctx (:obj:`int`, optional, defaults to 256):
             Dimensionality of the causal mask (usually same as n_positions).
         n_embd (:obj:`int`, optional, defaults to 1280):
             Dimensionality of the embeddings and hidden states.
         dff (:obj:`int`, optional, defaults to 8192):
             Dimensionality of the inner dimension of the FFN.
         n_layer (:obj:`int`, optional, defaults to 48):
             Number of hidden layers in the Transformer encoder.
         n_head (:obj:`int`, optional, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
         resid_pdrop (:obj:`float`, optional, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, optional, defaults to 0.1):
             The dropout ratio for the embeddings.
         attn_pdrop (:obj:`float`, optional, defaults to 0.1):
             The dropout ratio for the attention.
         layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
             The epsilon to use in the layer normalization layers
         initializer_range (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

     Example::

         >>> from transformers import CTRLModel, CTRLConfig
         >>> # Initializing a CTRL configuration
         >>> configuration = CTRLConfig()
         >>> # Initializing a model from the configuration
         >>> model = CTRLModel(configuration)
         >>> # Accessing the model configuration
         >>> configuration = model.config
     """

     model_type = "ctrl"
...
@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DistilBertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
     It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
     the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
     to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
     for more information.

     Args:
         vocab_size (:obj:`int`, optional, defaults to 30522):
             Vocabulary size of the DistilBERT model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
         max_position_embeddings (:obj:`int`, optional, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
         sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
             Whether to use sinusoidal positional embeddings.
         n_layers (:obj:`int`, optional, defaults to 6):
             Number of hidden layers in the Transformer encoder.
         n_heads (:obj:`int`, optional, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
         dim (:obj:`int`, optional, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
         hidden_dim (:obj:`int`, optional, defaults to 3072):
             The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         dropout (:obj:`float`, optional, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, optional, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
             The non-linear activation function (function or string) in the encoder and pooler.
             If string, "gelu", "relu", "swish" and "gelu_new" are supported.
         initializer_range (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         qa_dropout (:obj:`float`, optional, defaults to 0.1):
             The dropout probabilities used in the question answering model
             :class:`~transformers.DistilBertForQuestionAnswering`.
         seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
             The dropout probabilities used in the sequence classification and the multiple choice model
             :class:`~transformers.DistilBertForSequenceClassification`.

     Example::

         >>> from transformers import DistilBertModel, DistilBertConfig
         >>> # Initializing a DistilBERT configuration
         >>> configuration = DistilBertConfig()
         >>> # Initializing a model from the configuration
         >>> model = DistilBertModel(configuration)
         >>> # Accessing the model configuration
         >>> configuration = model.config
     """

     model_type = "distilbert"
...
...@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(BertConfig): class DPRConfig(BertConfig):
r""" r"""
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model according to the specified arguments.
Args: Args:
projection_dim (:obj:`int`, optional, defaults to 0): projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders. Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done. If it is set to zero (default), then no projection is done.
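Example (a minimal sketch following the config-then-model pattern of the other classes in this file; the projection size and the choice of `DPRQuestionEncoder` are illustrative)::
>>> from transformers import DPRConfig, DPRQuestionEncoder
>>> # Initializing a DPR configuration with an illustrative 128-dimensional projection
>>> configuration = DPRConfig(projection_dim=128)
>>> # Initializing a question encoder from the configuration
>>> model = DPRQuestionEncoder(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config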
""" """
model_type = "dpr" model_type = "dpr"
......
...@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig): class ElectraConfig(PretrainedConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`. This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__ the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture. architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information. for more information.
Args: Args:
vocab_size (:obj:`int`, optional, defaults to 30522): vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128): embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of the token and position embeddings.
hidden_size (:obj:`int`, optional, defaults to 256): hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12): num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4): num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024): intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512): max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2): type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`. The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02): initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"): summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`. :class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options: Is one of the following options:
- 'last' => take the last token hidden state (like XLNet) - 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert) - 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states - 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention - 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`. :class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`. :class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation. 'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0): summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`. :class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation Add a dropout after the projection and activation
Example:: Example::
>>> from transformers import ElectraModel, ElectraConfig >>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA electra-base-uncased style configuration
>>> configuration = ElectraConfig() >>> configuration = ElectraConfig()
>>> # Initializing a model from the electra-base-uncased style configuration >>> # Initializing a model from the electra-base-uncased style configuration
>>> model = ElectraModel(configuration) >>> model = ElectraModel(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
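Continuing the example above, the `summary_*` arguments documented earlier can also be set explicitly; a minimal sketch (the values below are illustrative, not a recommendation)::
>>> # Illustrative sequence-summary settings for the multiple choice head
>>> configuration = ElectraConfig(summary_type="first", summary_use_proj=True, summary_activation="gelu", summary_last_dropout=0.1)
>>> model = ElectraModel(configuration)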
""" """
model_type = "electra" model_type = "electra"
......
...@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__) ...@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig): class EncoderDecoderConfig(PretrainedConfig):
r""" r"""
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`. :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs. It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs. and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information. See the documentation for :class:`~transformers.PretrainedConfig` for more information.
Args: Args:
kwargs (`optional`): kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably: Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`): encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config. An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`): decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config. An instance of a configuration object that defines the decoder config.
Example:: Example::
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> # Initializing a BERT bert-base-uncased style configuration >>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig() >>> config_encoder = BertConfig()
>>> config_decoder = BertConfig() >>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config) >>> model = EncoderDecoderModel(config=config)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> config_encoder = model.config.encoder >>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder >>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm >>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True >>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True >>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration >>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model') >>> model.save_pretrained('my-model')
>>> # loading model and config from pretrained folder >>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model') >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config) >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
""" """
model_type = "encoder_decoder" model_type = "encoder_decoder"
......
...@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig): class FlaubertConfig(XLMConfig):
""" """
Configuration class to store the configuration of a :class:`~transformers.FlaubertModel`, which
inherits from :class:`~transformers.XLMConfig`.
It is used to instantiate a Flaubert model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture. the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information. for more information.
Args: Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018) attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0): layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020) with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145): vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048): emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12): n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16): n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1): dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler. layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1): attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU activation in the encoder and pooler. If set to `True`, "gelu" will be used
instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`): causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner. Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`): asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer. layer.
n_langs (:obj:`int`, optional, defaults to 1): n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models. The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them. for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512): max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). (e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices. initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices. initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0): bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary. The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1): eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary. The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2): pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary. The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3): unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary. The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5): mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary. The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"): summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
Is one of the following options: Is one of the following options:
- 'last' => take the last token hidden state (like XLNet) - 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert) - 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states - 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention - 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation. 'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1): summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5): start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet. Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5): end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet. Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0): mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context. Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1): lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating The ID of the language used by the model. This parameter is used when generating
text in a given language. text in a given language.
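Example (a minimal sketch mirroring the other configuration classes in this file)::
>>> from transformers import FlaubertModel, FlaubertConfig
>>> # Initializing a Flaubert configuration with the defaults described above
>>> configuration = FlaubertConfig()
>>> # Initializing a model from the configuration
>>> model = FlaubertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config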
""" """
model_type = "flaubert" model_type = "flaubert"
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig. """Constructs FlaubertConfig."""
"""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop self.layerdrop = layerdrop
self.pre_norm = pre_norm self.pre_norm = pre_norm
...@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig): class GPT2Config(PretrainedConfig):
""" """
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`. This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture. the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information. for more information.
Args: Args:
vocab_size (:obj:`int`, optional, defaults to 50257): vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024): n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024): n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions). Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768): n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states. Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12): n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12): n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None): n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'): activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"]. Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1): resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings. The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1): attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention. The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02): initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"): summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`. :class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options: Is one of the following options:
- 'last' => take the last token hidden state (like XLNet) - 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert) - 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states - 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention - 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`. :class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`. :class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation. 'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`. :class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1): summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`. :class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation Add a dropout before the projection and activation
Example:: Example::
>>> from transformers import GPT2Model, GPT2Config >>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration >>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config() >>> configuration = GPT2Config()
>>> # Initializing a model from the configuration >>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration) >>> model = GPT2Model(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
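Continuing the example above, and as noted for `n_inner`, leaving it at `None` sizes the feed-forward layers at four times `n_embd`; a hypothetical smaller variant (the sizes are illustrative, not an official checkpoint)::
>>> # n_inner is left as None, so the feed-forward size becomes 4 * 512 = 2048
>>> small_configuration = GPT2Config(n_layer=6, n_head=8, n_embd=512)
>>> small_model = GPT2Model(small_configuration)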
""" """
model_type = "gpt2" model_type = "gpt2"
......
...@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig): class LongformerConfig(RobertaConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information. It reuses the same defaults. Please check the parent class for more information.
Args: Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512): attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers. Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``. ``len(attention_window) == num_hidden_layers``.
Example:: Example::
>>> from transformers import LongformerConfig, LongformerModel >>> from transformers import LongformerConfig, LongformerModel
>>> # Initializing a Longformer configuration >>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig() >>> configuration = LongformerConfig()
>>> # Initializing a model from the configuration >>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration) >>> model = LongformerModel(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
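Continuing the example above, and as described for `attention_window`, a per-layer window size can also be passed as a list; a minimal sketch (the window size of 256 is illustrative)::
>>> # One window size per layer; the list length must equal num_hidden_layers (12 by default)
>>> configuration = LongformerConfig(attention_window=[256] * 12)
>>> model = LongformerModel(configuration)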
""" """
model_type = "longformer" model_type = "longformer"
......
...@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig): class MobileBertConfig(PretrainedConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`. This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture. architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information. for more information.
Args: Args:
vocab_size (:obj:`int`, optional, defaults to 30522): vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512): hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24): num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4): num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512): intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"): hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0): hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512): max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2): type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`. The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02): initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0): pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding. The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128): embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors. The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True): trigram_input (:obj:`bool`, optional, defaults to True):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to True): use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use bottleneck in BERT. Whether to use bottleneck in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128): intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output. Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False): use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation. Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True): key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for the query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4): num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block. Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"): normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT. The normalization type in BERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig >>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration >>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig() >>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above >>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration) >>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
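Continuing the example above, the bottleneck-related arguments can also be set explicitly; a minimal sketch (the values below simply restate the documented defaults)::
>>> # Illustrative bottleneck settings; these restate the documented defaults
>>> configuration = MobileBertConfig(use_bottleneck=True, intra_bottleneck_size=128, key_query_shared_bottleneck=True, num_feedforward_networks=4)
>>> model = MobileBertModel(configuration)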
Attributes: Attributes:
pretrained_config_archive_map (Dict[str, str]): pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints. A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert" model_type = "mobilebert"
......
...@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig): class OpenAIGPTConfig(PretrainedConfig):
""" """
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`. This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI. the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information. for more information.
Args: Args:
vocab_size (:obj:`int`, optional, defaults to 40478): vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512): n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512): n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions). Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768): n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states. Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12): n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12): n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1): resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings. The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1): attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention. The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02): initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`): predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"): summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`. :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options: Is one of the following options:
- 'last' => take the last token hidden state (like XLNet) - 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert) - 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states - 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention - 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`. :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`. :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation. 'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`. :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1): summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`. :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation Add a dropout before the projection and activation
Example:: Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration >>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig() >>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration >>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration) >>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
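Continuing the example above, the `summary_*` arguments configure the multiple choice head of :class:`~transformers.OpenAIGPTDoubleHeadsModel`; a minimal sketch (illustrative values only)::
>>> from transformers import OpenAIGPTDoubleHeadsModel
>>> # Illustrative sequence-summary settings for the double heads model
>>> configuration = OpenAIGPTConfig(summary_type="cls_index", summary_use_proj=True, summary_first_dropout=0.1)
>>> model = OpenAIGPTDoubleHeadsModel(configuration)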
""" """
model_type = "openai-gpt" model_type = "openai-gpt"
......