Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
......@@ -90,11 +90,11 @@ class TokenClassificationTask:
sequence_a_segment_id=0,
mask_padding_with_zero=True,
) -> List[InputFeatures]:
""" Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""
# TODO clean up all this to leverage built-in features of tokenizers
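# Illustration (editorial sketch, not library code): the two CLS placements the
# docstring above describes, shown with plain token lists and segment ids.
tokens_a = ["hello", "world"]
# BERT/XLM pattern (cls_token_at_end=False): [CLS] + A + [SEP], CLS gets segment id 0
bert_tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
bert_segment_ids = [0] * len(bert_tokens)
# XLNet/GPT pattern (cls_token_at_end=True): A + [SEP] + [CLS], CLS gets segment id 2
xlnet_tokens = tokens_a + ["[SEP]", "[CLS]"]
xlnet_segment_ids = [0] * (len(tokens_a) + 1) + [2]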
......@@ -230,7 +230,8 @@ if is_torch_available():
):
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
data_dir,
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
)
# Make sure only the first process in distributed training processes the dataset,
......
......@@ -14,18 +14,18 @@ def swish(x):
def _gelu_python(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
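# Illustration (editorial sketch, not library code): the exact erf-based gelu above and
# the tanh approximation agree to within roughly 1e-3; a quick check, assuming torch is installed.
import math
import torch
x = torch.linspace(-3.0, 3.0, steps=7)
exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
approx = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
print(torch.max(torch.abs(exact - approx)).item())  # small, on the order of 1e-3 or below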
......
......@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
# run an additional 5 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
func,
repeat=1,
number=5,
)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
......
......@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
@dataclass
class TensorFlowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
default=None,
metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
default=0,
metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager mode."})
use_xla: bool = field(
......
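For context, the `field(default=..., metadata={"help": ...})` pattern in the hunk above is plain `dataclasses` machinery; the metadata dict is what an argument parser (such as transformers' `HfArgumentParser`) reads to build help text. A minimal sketch (class name is illustrative):

from dataclasses import dataclass, field

@dataclass
class ToyBenchmarkArguments:
    tpu_name: str = field(default=None, metadata={"help": "Name of TPU"})
    device_idx: int = field(default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."})

args = ToyBenchmarkArguments()
print(args.device_idx)  # 0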
......@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
timeit.repeat(func, repeat=1, number=5)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
return min(runtimes) / 10.0
except ResourceExhaustedError as e:
......
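The timing pattern in both benchmark hunks above follows the timeit documentation they link to: `timeit.repeat` returns one total per repeat, each covering `number` calls, so the per-call estimate is `min(runtimes) / number`. A minimal sketch of that calculation:

import timeit

def func():
    sum(range(1000))

runtimes = timeit.repeat(func, repeat=3, number=10)  # three totals, each over 10 calls
print(min(runtimes) / 10.0)  # best-case seconds per call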
......@@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(
def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
"""
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements, it is important that the function
is executed in a separate process.
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run the function in a separate process or not
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements, it is important that the function
is executed in a separate process.
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run the function in a separate process or not
"""
def multi_process_func(*args, **kwargs):
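# Illustration (editorial sketch, not library code): the general pattern the wrapper above
# relies on -- run a callable in its own Process and send the result back over a Pipe so
# its allocations never count against the parent process. Names are illustrative; the
# callable must be picklable under the spawn start method.
from multiprocessing import Pipe, Process

def _child(conn, fn):
    conn.send(fn())
    conn.close()

def run_isolated(fn):
    parent_conn, child_conn = Pipe()
    worker = Process(target=_child, args=(child_conn, fn))
    worker.start()
    result = parent_conn.recv()
    worker.join()
    return result
# usage: run_isolated(some_zero_argument_function)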
......@@ -106,13 +106,13 @@ def is_memory_tracing_enabled():
class Frame(NamedTuple):
""" `Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""`Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
filename: str
......@@ -123,10 +123,10 @@ class Frame(NamedTuple):
class UsedMemoryState(NamedTuple):
""" `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""`UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""
frame: Frame
......@@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):
class Memory(NamedTuple):
""" `Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""`Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""
bytes: int
......@@ -147,11 +147,11 @@ class Memory(NamedTuple):
class MemoryState(NamedTuple):
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""
frame: Frame
......@@ -161,14 +161,14 @@ class MemoryState(NamedTuple):
class MemorySummary(NamedTuple):
""" `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""`MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""
sequential: List[MemoryState]
......@@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]
def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
"""
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
Args:
- `function`: (`callable`): function() -> ...
function without any arguments for which to measure the peak memory
Args:
- `function`: (`callable`): function() -> ...
function without any arguments for which to measure the peak memory
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in seconds for which to measure the memory usage
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in seconds for which to measure the memory usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
Returns:
- `max_memory`: (`int`)
consumed memory peak in bytes
Returns:
- `max_memory`: (`int`)
consumed memory peak in bytes
"""
def get_cpu_memory(process_id: int) -> int:
"""
measures current cpu memory usage of a given `process_id`
measures current cpu memory usage of a given `process_id`
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Returns
- `memory`: (`int`)
consumed memory in bytes
Returns
- `memory`: (`int`)
consumed memory in bytes
"""
process = psutil.Process(process_id)
try:
......@@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
class MemoryMeasureProcess(Process):
"""
`MemoryMeasureProcess` inherits from `Process` and overrides
its `run()` method. Used to measure the memory usage of a process
`MemoryMeasureProcess` inherits from `Process` and overrides
its `run()` method. Used to measure the memory usage of a process
"""
def __init__(self, process_id: int, child_connection: Connection, interval: float):
......@@ -309,37 +309,37 @@ def start_memory_tracing(
events_to_trace: str = "line",
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
defaults to `line`
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Defaults to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
defaults to `line`
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Defaults to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
if is_psutil_available():
......@@ -371,8 +371,8 @@ def start_memory_tracing(
memory_trace = []
def traceit(frame, event, args):
""" Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""
global _is_memory_tracing_enabled
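# Illustration (editorial sketch, not library code): the bare sys.settrace pattern that
# traceit follows -- a trace function invoked on each event that records line hits and
# returns itself so local "line" events keep firing.
import sys

line_events = []

def tracer(frame, event, arg):
    if event == "line":
        line_events.append((frame.f_code.co_filename, frame.f_lineno))
    return tracer

def work():
    total = 0
    for i in range(3):
        total += i
    return total

sys.settrace(tracer)
work()
sys.settrace(None)
print(len(line_events))  # a handful of line events recorded inside work()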
......@@ -456,39 +456,39 @@ def start_memory_tracing(
def stop_memory_tracing(
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: True): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple has fields
- `bytes` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` is a namedtuple used to list the current frame state and has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: True): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple has fields
- `bytes` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` is a namedtuple used to list the current frame state and has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = False
......@@ -499,15 +499,19 @@ def stop_memory_tracing(
cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
memory_trace[:-1], memory_trace[1:]
):
for (
(frame, cpu_mem, gpu_mem),
(next_frame, next_cpu_mem, next_gpu_mem),
) in zip(memory_trace[:-1], memory_trace[1:]):
cpu_mem_inc = next_cpu_mem - cpu_mem
gpu_mem_inc = next_gpu_mem - gpu_mem
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
memory_diff_trace.append(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
)
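# Illustration (editorial sketch, not library code): the pairwise pattern used above,
# zipping the trace with itself shifted by one element to get per-line increments.
readings = [10, 12, 11, 15]  # e.g. successive memory readings
increments = [after - before for before, after in zip(readings[:-1], readings[1:])]
print(increments)  # [2, -1, 4]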
......@@ -529,7 +533,10 @@ def stop_memory_tracing(
) # order by the total CPU + GPU memory increase
cumulative_memory = list(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
)
......@@ -544,15 +551,17 @@ def stop_memory_tracing(
total_memory = Memory(total_memory)
return MemorySummary(
sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
sequential=memory_diff_trace,
cumulative=cumulative_memory,
current=memory_curr_trace,
total=total_memory,
)
return None
def bytes_to_mega_bytes(memory_amount: int) -> int:
""" Utility to convert a number of bytes (int) into a number of mega bytes (int)
"""
"""Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
return memory_amount >> 20
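# Illustration (editorial sketch, not library code): a right shift by 20 divides by 2**20,
# so the conversion truncates toward zero.
print((5 * 1024 * 1024 + 123) >> 20)  # 5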
......
......@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for attached classifiers.
Example::
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
... hidden_size=768,
... num_attention_heads=12,
... intermediate_size=3072,
... )
>>> # Initializing a model from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for attached classifiers.
Example::
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
... hidden_size=768,
... num_attention_heads=12,
... intermediate_size=3072,
... )
>>> # Initializing a model from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "albert"
......
......@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CONFIG_MAPPING = OrderedDict(
[
("retribert", RetriBertConfig,),
("t5", T5Config,),
("mobilebert", MobileBertConfig,),
("distilbert", DistilBertConfig,),
("albert", AlbertConfig,),
("camembert", CamembertConfig,),
("xlm-roberta", XLMRobertaConfig,),
(
"retribert",
RetriBertConfig,
),
(
"t5",
T5Config,
),
(
"mobilebert",
MobileBertConfig,
),
(
"distilbert",
DistilBertConfig,
),
(
"albert",
AlbertConfig,
),
(
"camembert",
CamembertConfig,
),
(
"xlm-roberta",
XLMRobertaConfig,
),
("pegasus", PegasusConfig),
("marian", MarianConfig,),
("mbart", MBartConfig,),
("bart", BartConfig,),
("reformer", ReformerConfig,),
("longformer", LongformerConfig,),
("roberta", RobertaConfig,),
("flaubert", FlaubertConfig,),
("bert", BertConfig,),
("openai-gpt", OpenAIGPTConfig,),
("gpt2", GPT2Config,),
("transfo-xl", TransfoXLConfig,),
("xlnet", XLNetConfig,),
("xlm", XLMConfig,),
("ctrl", CTRLConfig,),
("electra", ElectraConfig,),
("encoder-decoder", EncoderDecoderConfig,),
(
"marian",
MarianConfig,
),
(
"mbart",
MBartConfig,
),
(
"bart",
BartConfig,
),
(
"reformer",
ReformerConfig,
),
(
"longformer",
LongformerConfig,
),
(
"roberta",
RobertaConfig,
),
(
"flaubert",
FlaubertConfig,
),
(
"bert",
BertConfig,
),
(
"openai-gpt",
OpenAIGPTConfig,
),
(
"gpt2",
GPT2Config,
),
(
"transfo-xl",
TransfoXLConfig,
),
(
"xlnet",
XLNetConfig,
),
(
"xlm",
XLMConfig,
),
(
"ctrl",
CTRLConfig,
),
(
"electra",
ElectraConfig,
),
(
"encoder-decoder",
EncoderDecoderConfig,
),
]
)
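The ordering of the mapping above matters for the fallback pattern matching that `AutoConfig` performs on model identifiers: more specific keys such as "xlm-roberta" must be tried before "roberta". A minimal sketch of that kind of lookup (the toy mapping and function name are illustrative, not the library's own code):

from collections import OrderedDict

TOY_MAPPING = OrderedDict(
    [("xlm-roberta", "XLMRobertaConfig"), ("roberta", "RobertaConfig"), ("bert", "BertConfig")]
)

def guess_config_name(model_identifier):
    for pattern, config_name in TOY_MAPPING.items():
        if pattern in model_identifier:
            return config_name
    raise ValueError("Unrecognized model identifier: {}".format(model_identifier))

print(guess_config_name("xlm-roberta-base"))  # XLMRobertaConfig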
class AutoConfig:
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
"""
def __init__(self):
......
......@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class BartConfig(PretrainedConfig):
r"""
Configuration class for Bart. Parameters are renamed from the fairseq implementation
Configuration class for Bart. Parameters are renamed from the fairseq implementation
"""
model_type = "bart"
......@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
**common_kwargs
):
r"""
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
Examples::
Examples::
>>> from transformers import BartConfig, BartModel
>>> from transformers import BartConfig, BartModel
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
"""
if "hidden_size" in common_kwargs:
......
......@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "bert"
......
......@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "ctrl"
......
......@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
>>> from transformers import DistilBertModel, DistilBertConfig
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
>>> from transformers import DistilBertModel, DistilBertConfig
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "distilbert"
......
......@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(BertConfig):
r"""
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRModel`.
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRModel`.
This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model.
This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
"""
model_type = "dpr"
......
......@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of the encoder layers and the pooler layer.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation
Example::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA google/electra-small-discriminator style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the google/electra-small-discriminator style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of the embedding layer.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation
Example::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA google/electra-small-discriminator style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the google/electra-small-discriminator style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "electra"
......
......@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig):
r"""
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information.
Args:
kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config.
Args:
kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config.
Example::
Example::
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
"""
model_type = "encoder_decoder"
......
......@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig):
"""
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU activation instead of ReLU as the non-linear activation
function in the encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU activation instead of ReLU as the non-linear activation
function in the encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
"""
model_type = "flaubert"
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig.
"""
"""Constructs FlaubertConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop
self.pre_norm = pre_norm
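A minimal sketch of the two Flaubert-specific arguments handled by this constructor (the values are illustrative, not those of a released FlauBERT checkpoint; every other setting keeps its XLM-style default):

from transformers import FlaubertConfig

# Enable pre-layer-norm and a 10% LayerDrop probability.
configuration = FlaubertConfig(layerdrop=0.1, pre_norm=True)
print(configuration.layerdrop, configuration.pre_norm)  # 0.1 True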
......@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"
......
......@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Example::
Example::
>>> from transformers import LongformerConfig, LongformerModel
>>> from transformers import LongformerConfig, LongformerModel
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "longformer"
......
......@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use a bottleneck layer in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for the query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use a bottleneck layer in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for the query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert"
......
......@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "openai-gpt"
......