Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
......@@ -11,21 +11,22 @@ from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrained
InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset
and collate them into a batch, as a dictionary of Tensors.
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of Tensors.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])
def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
"""
Very simple data collator that simply collates batches of dict-like objects and performs special handling for potential keys named:
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- ``label``: handles a single value (int or float) per object
- ``label_ids``: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs to the model.
See glue and ner for an example of how it's useful.
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs to
the model. See glue and ner for an example of how it's useful.
"""
# In this function we'll make the assumption that all `features` in the batch
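For orientation, a minimal sketch of how the collator described above behaves (illustrative only; it assumes
``transformers`` is installed and that the features already share the same length, since no padding is applied)::

    import torch
    from transformers import default_data_collator

    features = [
        {"input_ids": [101, 7592, 102], "label": 0},
        {"input_ids": [101, 2088, 102], "label": 1},
    ]
    batch = default_data_collator(features)
    # "label" is renamed to "labels" and stacked into a tensor; every other key keeps
    # its name and is converted to a tensor as-is.
    assert batch["labels"].tolist() == [0, 1]
    assert batch["input_ids"].shape == torch.Size([2, 3])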
......@@ -73,11 +74,11 @@ class DataCollatorWithPadding:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
......@@ -87,8 +88,8 @@ class DataCollatorWithPadding:
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""
tokenizer: PreTrainedTokenizerBase
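A hedged usage sketch of the padding options described above (the checkpoint name is only an example and loading it
may require a download)::

    from transformers import AutoTokenizer, DataCollatorWithPadding

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    collator = DataCollatorWithPadding(tokenizer, padding="longest", pad_to_multiple_of=8)
    features = [tokenizer("a short sentence"), tokenizer("a noticeably longer example sentence")]
    batch = collator(features)
    # Every tensor in the batch is padded to the longest sequence, rounded up to a
    # multiple of 8 (useful for Tensor Cores, as noted above).
    print(batch["input_ids"].shape)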
......@@ -117,6 +118,7 @@ class DataCollatorWithPadding:
class DataCollatorForLanguageModeling:
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
......@@ -198,6 +200,7 @@ class DataCollatorForLanguageModeling:
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
......@@ -275,8 +278,8 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
Setting 'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref.
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting
'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref.
"""
if self.tokenizer.mask_token is None:
......@@ -316,6 +319,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
class DataCollatorForSOP(DataCollatorForLanguageModeling):
"""
Data collator used for sentence order prediction task.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for both masked language modeling and sentence order prediction
"""
......@@ -342,8 +346,8 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% original.
N-gram not applied yet.
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet.
"""
if self.tokenizer.mask_token is None:
raise ValueError(
......@@ -385,6 +389,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
class DataCollatorForPermutationLanguageModeling:
"""
Data collator used for permutation language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for permutation language modeling with procedures specific to XLNet
"""
......@@ -425,10 +430,14 @@ class DataCollatorForPermutationLanguageModeling:
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
if self.tokenizer.mask_token is None:
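To make the four steps above concrete, here is a small, self-contained re-implementation sketch (the function name
and structure are mine, not the library's)::

    import torch

    def sample_plm_mask(seq_len, max_span_length=5, plm_probability=1 / 6):
        # Returns a boolean mask of the token positions to predict, following steps 0-4 above.
        masked = torch.zeros(seq_len, dtype=torch.bool)
        cur_len = 0                                                            # step 0
        while cur_len < seq_len:
            span_length = torch.randint(1, max_span_length + 1, (1,)).item()   # step 1
            context_length = int(span_length / plm_probability)                # step 2
            start_index = cur_len + torch.randint(0, context_length - span_length + 1, (1,)).item()  # step 3
            masked[start_index : start_index + span_length] = True
            cur_len += context_length                                          # step 4
        return masked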
......@@ -517,8 +526,7 @@ class DataCollatorForPermutationLanguageModeling:
@dataclass
class DataCollatorForNextSentencePrediction:
"""
Data collator used for next sentence prediction.
- collates examples which contain pre-generated negative examples
Data collator used for next sentence prediction. - collates examples which contain pre-generated negative examples
- preprocesses batches for masked language modeling
"""
......@@ -531,9 +539,12 @@ class DataCollatorForNextSentencePrediction:
def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
"""
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will not generate any negative examples.
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will
not generate any negative examples.
Args:
examples (:obj:`List[Dict]`): Each dictionary should have the following keys:
- ``tokens_a``: A sequence of tokens, which should appear before ``tokens_b`` in the text.
- ``tokens_b``: A sequence of tokens, which should appear after ``tokens_a`` in the text.
- ``is_random_next``: 1 if this pair is generated randomly, else 0.
......
......@@ -23,9 +23,8 @@ class GlueDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
line.
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
......@@ -55,8 +54,7 @@ class Split(Enum):
class GlueDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
args: GlueDataTrainingArguments
......
......@@ -19,8 +19,7 @@ logger = logging.get_logger(__name__)
class TextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
......@@ -91,8 +90,7 @@ class TextDataset(Dataset):
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
......@@ -118,8 +116,7 @@ class LineByLineTextDataset(Dataset):
class LineByLineWithRefDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
......@@ -294,8 +291,7 @@ class LineByLineWithSOPTextDataset(Dataset):
class TextDatasetForNextSentencePrediction(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
......
......@@ -86,8 +86,7 @@ class Split(Enum):
class SquadDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
args: SquadDataTrainingArguments
......
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
"""
Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
additional na_prob.json file is provided. This file is expected to map question IDs to the model's predicted
probability that a question is unanswerable.
"""
......@@ -589,8 +589,9 @@ def compute_predictions_log_probs(
tokenizer,
verbose_logging,
):
"""XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
"""
XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
null if needed.
Requires utils_squad_evaluate.py
"""
......
......@@ -52,9 +52,9 @@ def glue_convert_examples_to_features(
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if is_tf_available() and isinstance(examples, tf.data.Dataset):
......
......@@ -314,8 +314,8 @@ def squad_convert_examples_to_features(
tqdm_enabled=True,
):
"""
Converts a list of examples into a list of features that can be directly given as input to a model.
It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Converts a list of examples into a list of features that can be directly given as input to a model. It is
model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
......@@ -326,8 +326,7 @@ def squad_convert_examples_to_features(
is_training: whether to create features for model evaluation or model training.
padding_strategy: Default to "max_length". Which padding strategy to use
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.data.TensorDataset,
if 'tf': returns a tf.data.Dataset
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
threads: multiple processing threads
......@@ -528,8 +527,8 @@ def squad_convert_examples_to_features(
class SquadProcessor(DataProcessor):
"""
Processor for the SQuAD data set.
Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and version 2.0 of SQuAD, respectively.
Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and
version 2.0 of SQuAD, respectively.
"""
train_file = None
......@@ -745,9 +744,9 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model.
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
:class:`~transformers.data.processors.squad.SquadExample` using the
:method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
......
......@@ -55,14 +55,13 @@ class InputExample:
@dataclass(frozen=True)
class InputFeatures:
"""
A single set of features of data.
Property names are the same names as the corresponding inputs to a model.
A single set of features of data. Property names are the same names as the corresponding inputs to a model.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
label: (Optional) Label corresponding to the input. Int for classification problems,
......@@ -83,7 +82,8 @@ class DataProcessor:
"""Base class for data converters for sequence classification data sets."""
def get_example_from_tensor_dict(self, tensor_dict):
"""Gets an example from a dict with tensorflow tensors.
"""
Gets an example from a dict with tensorflow tensors.
Args:
tensor_dict: Keys and values should match the corresponding Glue
......@@ -108,8 +108,10 @@ class DataProcessor:
raise NotImplementedError()
def tfds_map(self, example):
"""Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
This method converts examples to the correct format."""
"""
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
examples to the correct format.
"""
if len(self.get_labels()) > 1:
example.label = self.get_labels()[int(example.label)]
return example
......@@ -253,9 +255,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if max_length is None:
......
......@@ -26,8 +26,10 @@ logger = logging.get_logger(__name__)
class XnliProcessor(DataProcessor):
"""Processor for the XNLI dataset.
Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
"""
Processor for the XNLI dataset. Adapted from
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
"""
def __init__(self, language, train_language=None):
self.language = language
......
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at
https://github.com/allenai/allennlp Copyright by the AllenNLP authors.
"""
import fnmatch
......@@ -433,10 +432,9 @@ def add_start_docstrings_to_callable(*docstr):
note = r"""
.. note::
Although the recipe for forward pass needs to be defined within
this function, one should call the :class:`Module` instance afterwards
instead of this since the former takes care of running the
pre and post processing steps while the latter silently ignores them.
Although the recipe for forward pass needs to be defined within this function, one should call the
:class:`Module` instance afterwards instead of this since the former takes care of running the pre and post
processing steps while the latter silently ignores them.
"""
fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
......@@ -454,20 +452,18 @@ def add_end_docstrings(*docstr):
PT_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`:
A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
tuple of :obj:`torch.FloatTensor` comprising various elements depending on the configuration
(:class:`~transformers.{config_class}`) and inputs.
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`torch.FloatTensor`
comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
TF_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`:
A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
tuple of :obj:`tf.Tensor` comprising various elements depending on the configuration
(:class:`~transformers.{config_class}`) and inputs.
:class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`tf.Tensor` comprising
various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
......@@ -831,19 +827,16 @@ def is_remote_url(url_or_filename):
def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> str:
"""
Resolve a model identifier, and a file name, to a HF-hosted url
on either S3 or Cloudfront (a Content Delivery Network, or CDN).
Cloudfront is replicated over the globe so downloads are way faster
for the end user (and it also lowers our bandwidth costs). However, it
is more aggressively cached by default, so may not always reflect the
latest changes to the underlying file (default TTL is 24 hours).
In terms of client-side caching from this library, even though
Cloudfront relays the ETags from S3, using one or the other
(or switching from one to the other) will affect caching: cached files
are not shared between the two because the cached file's name contains
a hash of the url.
Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery
Network, or CDN).
Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes
to the underlying file (default TTL is 24 hours).
In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or
the other (or switching from one to the other) will affect caching: cached files are not shared between the two
because the cached file's name contains a hash of the url.
"""
endpoint = (
PRESET_MIRROR_DICT.get(mirror, mirror)
......@@ -861,12 +854,10 @@ def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> st
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
so that TF 2.0 can identify it as a HDF5 file
(see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
identify it as a HDF5 file (see
https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
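A simplified sketch of the filename scheme just described (it mirrors the logic rather than copying the library
function verbatim)::

    from hashlib import sha256

    def url_to_filename_sketch(url, etag=None):
        filename = sha256(url.encode("utf-8")).hexdigest()
        if etag is not None:
            filename += "." + sha256(etag.encode("utf-8")).hexdigest()
        if url.endswith(".h5"):
            filename += ".h5"
        return filename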
......@@ -885,8 +876,8 @@ def url_to_filename(url, etag=None):
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or
its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
......@@ -921,10 +912,10 @@ def cached_path(
local_files_only=False,
) -> Optional[str]:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file
and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and
then return the path.
Args:
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
force_download: if True, re-download the file even if it's already cached in the cache dir.
......@@ -936,8 +927,8 @@ def cached_path(
re-extract the archive and override the folder where it was extracted.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
Local path (string) otherwise
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
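A hedged usage sketch for ``cached_path`` (network access is needed on the first call; the URL is only an example)::

    from transformers.file_utils import cached_path

    local_path = cached_path(
        "https://raw.githubusercontent.com/huggingface/transformers/master/README.md"
    )
    # Later calls with the same URL resolve to the cached copy instead of
    # re-downloading, unless force_download=True is passed.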
......@@ -1045,12 +1036,12 @@ def get_from_cache(
local_files_only=False,
) -> Optional[str]:
"""
Given a URL, look for the corresponding file in the local cache.
If it's not there, download it. Then return the path to the cached file.
Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
path to the cached file.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
Local path (string) otherwise
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
......@@ -1213,8 +1204,8 @@ def is_tensor(x):
class ModelOutput(OrderedDict):
"""
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a
regular python dictionary.
a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular
python dictionary.
.. warning::
You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
......
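As a hedged illustration of the indexing behaviour described above (the ``ToyOutput`` subclass is hypothetical)::

    from dataclasses import dataclass
    from typing import Optional

    import torch
    from transformers.file_utils import ModelOutput

    @dataclass
    class ToyOutput(ModelOutput):
        loss: Optional[torch.FloatTensor] = None
        logits: torch.FloatTensor = None

    out = ToyOutput(logits=torch.ones(2, 3))
    assert out["logits"] is out.logits  # string indexing, like a dictionary
    assert out[0] is out.logits         # integer indexing skips the None ``loss``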
......@@ -84,8 +84,8 @@ class TFGenerationMixin:
Parameters:
input_ids (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`None` the method initializes
it as an empty :obj:`tf.Tensor` of shape :obj:`(1,)`.
The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
:obj:`tf.Tensor` of shape :obj:`(1,)`.
max_length (:obj:`int`, `optional`, defaults to 20):
The maximum length of the sequence to be generated.
min_length (:obj:`int`, `optional`, defaults to 10):
......@@ -141,9 +141,9 @@ class TFGenerationMixin:
Return:
:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size * num_return_sequences, sequence_length)`:
The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
shorter if all batches finished early due to the :obj:`eos_token_id`.
:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size * num_return_sequences,
sequence_length)`: The generated sequences. The second dimension (sequence_length) is either equal to
:obj:`max_length` or shorter if all batches finished early due to the :obj:`eos_token_id`.
Examples::
......@@ -428,8 +428,9 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequence are generated independantly.
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
independantly.
"""
# length of generated sentences / unfinished sentences
......@@ -976,7 +977,9 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
......@@ -1044,9 +1047,8 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
def sample_without_replacement(logits, num_samples):
"""
categorical sampling without replacement is currently not implemented
the gumbel-max trick will do for now
see https://github.com/tensorflow/tensorflow/issues/9260 for more info
categorical sampling without replacement is currently not implemented; the gumbel-max trick will do for now. See
https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
_, indices = tf.nn.top_k(logits + z, num_samples)
......@@ -1094,8 +1096,8 @@ class BeamHypotheses(object):
def is_done(self, best_sum_logprobs, cur_len):
"""
If there are enough hypotheses and none of the hypotheses being generated
can become better than the worst one in the heap, then we are done with this sentence.
If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
......
......@@ -150,8 +150,8 @@ class GenerationMixin:
Parameters:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`None` the method initializes
it as an empty :obj:`torch.LongTensor` of shape :obj:`(1,)`.
The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
:obj:`torch.LongTensor` of shape :obj:`(1,)`.
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
initial input_ids for the decoder of encoder-decoder type models. If :obj:`None` then only
decoder_start_token_id is passed as the first token to the decoder.
......@@ -210,9 +210,9 @@ class GenerationMixin:
Return:
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`:
The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
shorter if all batches finished early due to the :obj:`eos_token_id`.
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
batches finished early due to the :obj:`eos_token_id`.
Examples::
......@@ -531,8 +531,9 @@ class GenerationMixin:
use_cache,
model_kwargs,
):
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequence are generated independantly.
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
independantly.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)
......@@ -935,8 +936,10 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter
def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None:
"""Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...]
"""
Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a list
of list of banned tokens to ban in the format [[batch index, vocabulary position],...
Args:
scores: logits distribution of shape (batch size, vocabulary size)
banned_tokens: list of list of tokens to ban of length (batch_size)
......@@ -965,7 +968,9 @@ def top_k_top_p_filtering(
filter_value: float = -float("Inf"),
min_tokens_to_keep: int = 1,
) -> Tensor:
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
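A hedged usage sketch for the PyTorch version of this filter (shapes and thresholds are illustrative)::

    import torch
    from transformers.generation_utils import top_k_top_p_filtering

    logits = torch.randn(1, 50257)            # (batch size, vocabulary size)
    filtered = top_k_top_p_filtering(logits, top_k=50, top_p=0.95)
    probs = torch.softmax(filtered, dim=-1)   # filtered-out tokens got -inf, i.e. probability 0
    next_token = torch.multinomial(probs, num_samples=1)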
......@@ -1033,8 +1038,8 @@ class BeamHypotheses(object):
def is_done(self, best_sum_logprobs, cur_len):
"""
If there are enough hypotheses and none of the hypotheses being generated
can become better than the worst one in the heap, then we are done with this sentence.
If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
......
......@@ -104,11 +104,9 @@ class HfApi:
"""
Call HF API to sign in a user and get a token if credentials are valid.
Outputs:
token if credentials are valid
Outputs: token if credentials are valid
Throws:
requests.exceptions.HTTPError if credentials are invalid
Throws: requests.exceptions.HTTPError if credentials are invalid
"""
path = "{}/api/login".format(self.endpoint)
r = requests.post(path, json={"username": username, "password": password})
......@@ -152,8 +150,7 @@ class HfApi:
"""
Get a presigned url, then upload file to S3.
Outputs:
url: Read-only url for the stored file on S3.
Outputs: url: Read-only url for the stored file on S3.
"""
urls = self.presign(token, filename=filename, organization=organization)
# streaming upload:
......@@ -206,11 +203,10 @@ class HfApi:
class TqdmProgressFileReader:
"""
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
and override `f.read()` so as to display a tqdm progress bar.
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) and override `f.read()` so as to display a
tqdm progress bar.
see github.com/huggingface/transformers/pull/2078#discussion_r354739608
for implementation details.
see github.com/huggingface/transformers/pull/2078#discussion_r354739608 for implementation details.
"""
def __init__(self, f: io.BufferedReader):
......@@ -254,8 +250,7 @@ class HfFolder:
@classmethod
def delete_token(cls):
"""
Delete token.
Do not fail if token does not exist.
Delete token. Do not fail if token does not exist.
"""
try:
os.remove(cls.path_token)
......
......@@ -13,12 +13,11 @@ DataClassType = NewType("DataClassType", Any)
class HfArgumentParser(ArgumentParser):
"""
This subclass of `argparse.ArgumentParser` uses type hints on dataclasses
to generate arguments.
This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments.
The class is designed to play well with the native argparse. In particular,
you can add more (non-dataclass backed) arguments to the parser after initialization
and you'll get the output back after parsing as an additional namespace.
The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed)
arguments to the parser after initialization and you'll get the output back after parsing as an additional
namespace.
"""
dataclass_types: Iterable[DataClassType]
......@@ -27,8 +26,7 @@ class HfArgumentParser(ArgumentParser):
"""
Args:
dataclass_types:
Dataclass type, or list of dataclass types for which we will "fill" instances
with the parsed args.
Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args.
kwargs:
(Optional) Passed to `argparse.ArgumentParser()` in the regular way.
"""
......@@ -94,33 +92,27 @@ class HfArgumentParser(ArgumentParser):
"""
Parse command-line args into instances of the specified dataclass types.
This relies on argparse's `ArgumentParser.parse_known_args`.
See the doc at:
This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at:
docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args
Args:
args:
List of strings to parse. The default is taken from sys.argv.
(same as argparse.ArgumentParser)
List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser)
return_remaining_strings:
If true, also return a list of remaining argument strings.
look_for_args_file:
If true, will look for a ".args" file with the same base name
as the entry point script for this process, and will append its
potential content to the command line args.
If true, will look for a ".args" file with the same base name as the entry point script for this
process, and will append its potential content to the command line args.
args_filename:
If not None, will use this file instead of the ".args" file
specified in the previous argument.
If not None, will use this file instead of the ".args" file specified in the previous argument.
Returns:
Tuple consisting of:
- the dataclass instances in the same order as they
were passed to the initializer.
- if applicable, an additional namespace for more
(non-dataclass backed) arguments added to the parser
- the dataclass instances in the same order as they were passed to the initializer.
- if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser
after initialization.
- The potential list of remaining argument strings.
(same as argparse.ArgumentParser.parse_known_args)
- The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args)
"""
if args_filename or (look_for_args_file and len(sys.argv)):
if args_filename:
......@@ -155,8 +147,8 @@ class HfArgumentParser(ArgumentParser):
def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]:
"""
Alternative helper method that does not use `argparse` at all,
instead loading a json file and populating the dataclass types.
Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
dataclass types.
"""
data = json.loads(Path(json_file).read_text())
outputs = []
......@@ -169,8 +161,8 @@ class HfArgumentParser(ArgumentParser):
def parse_dict(self, args: dict) -> Tuple[DataClass, ...]:
"""
Alternative helper method that does not use `argparse` at all,
instead uses a dict and populating the dataclass types.
Alternative helper method that does not use `argparse` at all, instead uses a dict and populating the dataclass
types.
"""
outputs = []
for dtype in self.dataclass_types:
......
......@@ -298,8 +298,7 @@ class TensorBoardCallback(TrainerCallback):
class WandbCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `Weights & Biases
<https://www.wandb.com/>`__.
A :class:`~transformers.TrainerCallback` that sends the logs to `Weights & Biases <https://www.wandb.com/>`__.
"""
def __init__(self):
......@@ -310,8 +309,8 @@ class WandbCallback(TrainerCallback):
"""
Setup the optional Weights & Biases (`wandb`) integration.
One can subclass and override this method to customize the setup if needed. Find more information
`here <https://docs.wandb.com/huggingface>`__. You can also override the following environment variables:
One can subclass and override this method to customize the setup if needed. Find more information `here
<https://docs.wandb.com/huggingface>`__. You can also override the following environment variables:
Environment:
WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`):
......@@ -368,8 +367,7 @@ class WandbCallback(TrainerCallback):
class CometCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML
<https://www.comet.ml/site/>`__.
A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML <https://www.comet.ml/site/>`__.
"""
def __init__(self):
......@@ -388,8 +386,8 @@ class CometCallback(TrainerCallback):
COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`):
Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE"
For a number of configurable items in the environment,
see `here <https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
For a number of configurable items in the environment, see `here
<https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
"""
self._initialized = True
if state.is_world_process_zero:
......@@ -424,8 +422,7 @@ class CometCallback(TrainerCallback):
class MLflowCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow
<https://www.mlflow.org/>`__.
A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow <https://www.mlflow.org/>`__.
"""
MAX_LOG_SIZE = 100
......@@ -443,10 +440,9 @@ class MLflowCallback(TrainerCallback):
HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`):
Whether to use MLflow .log_artifact() facility to log artifacts.
This only makes sense if logging to a remote server, e.g. s3 or GCS.
If set to `True` or `1`, will copy whatever is in TrainerArgument's output_dir
to the local or remote artifact storage. Using it without a remote storage
will just copy the files to your artifact location.
This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
whatever is in TrainerArgument's output_dir to the local or remote artifact storage. Using it without a
remote storage will just copy the files to your artifact location.
"""
log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
if log_artifacts in {"TRUE", "1"}:
......
......@@ -36,18 +36,14 @@ logger = logging.get_logger(__name__)
class ModelCard:
r"""Structured Model Card class.
Store model card as well as methods for loading/downloading/saving model cards.
r"""
Structured Model Card class. Store model card as well as methods for loading/downloading/saving model cards.
Please read the following paper for details and explanation on the sections:
"Model Cards for Model Reporting"
by Margaret Mitchell, Simone Wu,
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
Link: https://arxiv.org/abs/1810.03993
Please read the following paper for details and explanation on the sections: "Model Cards for Model Reporting" by
Margaret Mitchell, Simone Wu, Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards. Link: https://arxiv.org/abs/1810.03993
Note:
A model card can be loaded and saved to disk.
Note: A model card can be loaded and saved to disk.
Parameters:
"""
......@@ -85,37 +81,46 @@ class ModelCard:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
r"""
Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.:
``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3,
e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a model card file saved using the
:func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
card should be cached if the standard cache should not be used.
Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache
should not be used.
kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
- The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
- The values in kwargs of any keys which are model card attributes will be used to override the loaded
values.
- Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the
`return_unused_kwargs` keyword parameter.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
find_from_standard_name: (`optional`) boolean, default True:
If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename.
Can be used to directly feed a model/config url and access the colocated modelcard.
If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them
with our standard modelcard filename. Can be used to directly feed a model/config url and access the
colocated modelcard.
return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final model card object.
- If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: i.e. the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
- If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a
dictionary consisting of the key/value pairs whose keys are not model card attributes: i.e. the part of
kwargs which has not been used to update `ModelCard` and is otherwise ignored.
Examples::
......
......@@ -453,8 +453,9 @@ class AlbertTransformer(nn.Module):
class AlbertPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = AlbertConfig
......@@ -486,16 +487,16 @@ class AlbertForPreTrainingOutput(ModelOutput):
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -514,14 +515,15 @@ ALBERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Args:
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
......@@ -529,35 +531,33 @@ ALBERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.AlbertTokenizer`.
See :meth:`transformers.PreTrainedTokenizer.__call__` and
:meth:`transformers.PreTrainedTokenizer.encode` for details.
Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``:
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
......@@ -615,17 +615,16 @@ class AlbertModel(AlbertPreTrainedModel):
return self.embeddings.word_embeddings
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
ALBERT has a different architecture in that its layers are shared across groups, which then have inner groups.
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
is a total of 4 different layers.
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
a different architecture in that its layers are shared across groups, which then have inner groups. If an ALBERT
model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
while [2,3] correspond to the two inner groups of the second hidden layer.
Any layer with an index other than [0,1,2,3] will result in an error.
See base class PreTrainedModel for more information about head pruning.
Any layer with an index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
information about head pruning.
"""
for layer, heads in heads_to_prune.items():
group_idx = int(layer / self.config.inner_group_num)
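A hedged illustration of the flattened-layer indexing described above (the checkpoint is only an example and loading
it downloads weights)::

    from transformers import AlbertModel

    model = AlbertModel.from_pretrained("albert-base-v2")
    # Keys are flattened layer indices, values are the head indices to prune in that layer.
    model.prune_heads({0: [0, 1]})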
......@@ -706,8 +705,10 @@ class AlbertModel(AlbertPreTrainedModel):
@add_start_docstrings(
"""Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
a `sentence order prediction (classification)` head. """,
"""
Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForPreTraining(AlbertPreTrainedModel):
......@@ -745,15 +746,13 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates original order (sequence A, then sequence B),
``1`` indicates switched order (sequence B, then sequence A).
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence
A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A).
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
......@@ -903,10 +902,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
labels in ``[0, ..., config.vocab_size]``
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
......@@ -952,8 +950,10 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
@add_start_docstrings(
"""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
"""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
......@@ -989,9 +989,8 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss).
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
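A hedged sketch of the ``num_labels`` switch documented above (regression vs. classification); the tiny configuration values below are assumptions chosen only to keep the example light, not a recommended setup.
import torch
from transformers import AlbertConfig, AlbertForSequenceClassification
# num_labels == 1 would take the MSE (regression) branch; > 1 uses cross-entropy.
config = AlbertConfig(vocab_size=100, embedding_size=16, hidden_size=32,
                      num_hidden_layers=2, num_attention_heads=2,
                      intermediate_size=64, num_labels=3)
model = AlbertForSequenceClassification(config)
outputs = model(input_ids=torch.tensor([[2, 45, 87, 3]]),
                labels=torch.tensor([1]), return_dict=True)
print(outputs.loss, outputs.logits.shape)  # scalar loss, logits of shape (1, 3)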
......@@ -1036,8 +1035,10 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
@add_start_docstrings(
"""Albert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
......@@ -1076,8 +1077,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1123,8 +1124,10 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
@add_start_docstrings(
"""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ALBERT_START_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
......@@ -1164,12 +1167,12 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
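A hedged usage sketch for ``start_positions`` / ``end_positions``; the checkpoint name and the gold span indices are assumptions for illustration (the base checkpoint has no trained QA head, so the loss value is meaningless here).
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")   # assumed checkpoint
model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2")
inputs = tokenizer("Who wrote Hamlet?", "Hamlet was written by Shakespeare.",
                   return_tensors="pt")
# Indices into the packed (question, context) sequence; out-of-range values would
# simply be clamped and excluded from the loss, as noted above.
outputs = model(**inputs, start_positions=torch.tensor([9]),
                end_positions=torch.tensor([10]), return_dict=True)
print(outputs.loss, outputs.start_logits.shape, outputs.end_logits.shape)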
......@@ -1223,8 +1226,10 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
@add_start_docstrings(
"""Albert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForMultipleChoice(AlbertPreTrainedModel):
......@@ -1259,9 +1264,9 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors (see
`input_ids` above).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
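Since ``num_choices`` is read from the second dimension of the inputs (as in the line above), here is a hedged sketch of the expected ``(batch_size, num_choices, seq_len)`` layout; the checkpoint and texts are assumptions for the example.
import torch
from transformers import AlbertTokenizer, AlbertForMultipleChoice
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")   # assumed checkpoint
model = AlbertForMultipleChoice.from_pretrained("albert-base-v2")
prompt = "The man went to the kitchen and"
choices = ["opened the fridge.", "flew to the moon."]
enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
# Add the batch dimension: every tensor becomes (batch_size=1, num_choices=2, seq_len).
inputs = {k: v.unsqueeze(0) for k, v in enc.items()}
outputs = model(**inputs, labels=torch.tensor([0]), return_dict=True)
print(outputs.loss, outputs.logits.shape)  # logits: (1, 2)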
......
......@@ -462,9 +462,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
The model class to instantiate is selected based on the :obj:`model_type` property of the config object
(either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's
missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
List options
......@@ -517,12 +517,10 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (:obj:`Dict[str, str], `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error
messages.
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
......@@ -546,8 +544,8 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
class AutoModel:
r"""
This is a generic model class that will be instantiated as one of the base model classes of the library
when created with the :meth:`~transformers.AutoModel.from_pretrained` class method or the
This is a generic model class that will be instantiated as one of the base model classes of the library when
created with the :meth:`~transformers.AutoModel.from_pretrained` class method or the
:meth:`~transformers.AutoModel.from_config` class methods.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
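A brief, hedged illustration of the two entry points mentioned above (the checkpoint name is an assumption for the example):
from transformers import AutoConfig, AutoModel
config = AutoConfig.from_pretrained("bert-base-uncased")   # assumed checkpoint
model = AutoModel.from_config(config)                      # architecture only, random weights
model = AutoModel.from_pretrained("bert-base-uncased")     # downloads and loads the trained weights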
......@@ -567,9 +565,8 @@ class AutoModel:
Instantiates one of the base model classes of the library from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :meth:`~transformers.AutoModel.from_pretrained` to load
the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModel.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -661,9 +658,9 @@ class AutoModelForPreTraining:
model---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use
:meth:`~transformers.AutoModelForPreTraining.from_pretrained` to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForPreTraining.from_pretrained` to load the model
weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -761,9 +758,9 @@ class AutoModelWithLMHead:
Instantiates one of the model classes of the library---with a language modeling head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :meth:`~transformers.AutoModelWithLMHead.from_pretrained`
to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelWithLMHead.from_pretrained` to load the model
weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -844,8 +841,8 @@ class AutoModelWithLMHead:
class AutoModelForCausalLM:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
causal language modeling head---when created with the
This is a generic model class that will be instantiated as one of the model classes of the library---with a causal
language modeling head---when created with the
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` class method or the
:meth:`~transformers.AutoModelForCausalLM.from_config` class method.
......@@ -867,9 +864,9 @@ class AutoModelForCausalLM:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :meth:`~transformers.AutoModelForCausalLM.from_pretrained`
to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForCausalLM.from_pretrained` to load the model
weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -938,8 +935,8 @@ class AutoModelForCausalLM:
class AutoModelForMaskedLM:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
masked language modeling head---when created with the
This is a generic model class that will be instantiated as one of the model classes of the library---with a masked
language modeling head---when created with the
:meth:`~transformers.AutoModelForMaskedLM.from_pretrained` class method or the
:meth:`~transformers.AutoModelForMaskedLM.from_config` class method.
......@@ -961,9 +958,9 @@ class AutoModelForMaskedLM:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :meth:`~transformers.AutoModelForMaskedLM.from_pretrained`
to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForMaskedLM.from_pretrained` to load the model
weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -1055,9 +1052,9 @@ class AutoModelForSeq2SeqLM:
head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :meth:`~transformers.AutoModelForSeq2SeqLM.from_pretrained`
to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForSeq2SeqLM.from_pretrained` to load the model
weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -1153,9 +1150,9 @@ class AutoModelForSequenceClassification:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use
:meth:`~transformers.AutoModelForSequenceClassification.from_pretrained` to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForSequenceClassification.from_pretrained` to load
the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -1250,9 +1247,9 @@ class AutoModelForQuestionAnswering:
Instantiates one of the model classes of the library---with a question answering head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use
:meth:`~transformers.AutoModelForQuestionAnswering.from_pretrained` to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForQuestionAnswering.from_pretrained` to load the
model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -1327,8 +1324,8 @@ class AutoModelForQuestionAnswering:
class AutoModelForTokenClassification:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
token classification head---when created with the
This is a generic model class that will be instantiated as one of the model classes of the library---with a token
classification head---when created with the
:meth:`~transformers.AutoModelForTokenClassification.from_pretrained` class method or the
:meth:`~transformers.AutoModelForTokenClassification.from_config` class method.
......@@ -1349,9 +1346,9 @@ class AutoModelForTokenClassification:
Instantiates one of the model classes of the library---with a token classification head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use
:meth:`~transformers.AutoModelForTokenClassification.from_pretrained` to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForTokenClassification.from_pretrained` to load
the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......@@ -1449,9 +1446,9 @@ class AutoModelForMultipleChoice:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use
:meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` to load the model weights.
Loading a model from its configuration file does **not** load the model weights. It only affects the
model's configuration. Use :meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` to load the
model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
......
......@@ -68,14 +68,15 @@ BART_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
......@@ -103,14 +104,13 @@ BART_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using :class:`~transformers.BartTokenizer`.
See :meth:`transformers.PreTrainedTokenizer.encode` and
:meth:`transformers.PreTrainedTokenizer.__call__` for details.
Indices can be obtained using :class:`~transformers.BartTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
......@@ -127,16 +127,16 @@ BART_INPUTS_DOCSTRING = r"""
modify to your needs. See diagram 1 in `the paper <https://arxiv.org/abs/1910.13461>`__ for more
information on the default strategy.
encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: :obj:`attentions`)
:obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`) is a
sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
the decoder.
Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
:obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a
sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the
decoder.
past_key_values (:obj:`Tuple[Dict[str, torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last
``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape
:obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
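A rough, hedged sketch of ``use_cache`` / ``past_key_values``; the checkpoint name is an assumption, the exact structure of the cache is an implementation detail, and :meth:`generate` normally manages it internally.
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")  # assumed checkpoint
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
enc = tokenizer("UN Chief Says There Is No Military Solution in Syria",
                return_tensors="pt")
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
out = model(input_ids=enc["input_ids"], decoder_input_ids=decoder_input_ids,
            use_cache=True, return_dict=True)
print(out.past_key_values is not None)  # cached key/value states, reusable at the next step
# In practice the cache is driven internally by generate():
summary_ids = model.generate(enc["input_ids"], num_beams=4, max_length=20)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))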
......@@ -160,9 +160,10 @@ def invert_mask(attention_mask):
def _prepare_bart_decoder_inputs(
config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32
):
"""Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if
none are provided. This mimics the default behavior in fairseq. To override it pass in masks.
Note: this is not called during generation
"""
Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
This mimics the default behavior in fairseq. To override it, pass in masks. Note: this is not called during
generation.
"""
pad_token_id = config.pad_token_id
if decoder_input_ids is None:
......@@ -292,8 +293,8 @@ class EncoderLayer(nn.Module):
class BartEncoder(nn.Module):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer
is a :class:`EncoderLayer`.
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
:class:`EncoderLayer`.
Args:
config: BartConfig
......@@ -334,14 +335,14 @@ class BartEncoder(nn.Module):
Args:
input_ids (LongTensor): tokens in the source language of shape
`(batch, src_len)`
attention_mask (torch.LongTensor): indicating which indices are padding tokens.
attention_mask (torch.LongTensor): indicating which indices are padding tokens
Returns:
BaseModelOutput or Tuple comprised of:
- **x** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)`
- **encoder_states** (tuple(torch.FloatTensor)): all intermediate
hidden states of shape `(src_len, batch, embed_dim)`.
Only populated if *output_hidden_states:* is True.
- **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- **encoder_states** (tuple(torch.FloatTensor)): all intermediate hidden states of shape `(src_len,
batch, embed_dim)`. Only populated if *output_hidden_states* is True.
- **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
......@@ -482,8 +483,8 @@ class DecoderLayer(nn.Module):
class BartDecoder(nn.Module):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer
is a :class:`DecoderLayer`.
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`.
Args:
config: BartConfig
embed_tokens (torch.nn.Embedding): output embedding
......@@ -530,8 +531,8 @@ class BartDecoder(nn.Module):
**unused,
):
"""
Includes several features from "Jointly Learning to Align and
Translate with Transformer Models" (Garg et al., EMNLP 2019).
Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
EMNLP 2019).
Args:
input_ids (LongTensor): previous decoder outputs of shape
......@@ -543,6 +544,7 @@ class BartDecoder(nn.Module):
Returns:
BaseModelOutputWithPast or tuple:
- the decoder's features of shape `(batch, tgt_len, embed_dim)`
- the cache
- hidden states
......@@ -783,10 +785,9 @@ class BartClassificationHead(nn.Module):
class LearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
Padding ids are ignored by either offsetting based on padding_idx
or by setting padding_idx to None and ensuring that the appropriate
position ids are passed to the forward function.
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset):
......@@ -1000,10 +1001,9 @@ class BartForConditionalGeneration(PretrainedBartModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss.
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
with labels in ``[0, ..., config.vocab_size]``.
Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
Returns:
......@@ -1128,7 +1128,10 @@ class BartForConditionalGeneration(PretrainedBartModel):
@add_start_docstrings(
"""Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """,
"""
Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
BART_START_DOCSTRING,
)
class BartForSequenceClassification(PretrainedBartModel):
......@@ -1166,9 +1169,8 @@ class BartForSequenceClassification(PretrainedBartModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
......@@ -1214,8 +1216,10 @@ class BartForSequenceClassification(PretrainedBartModel):
@add_start_docstrings(
"""BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BART_START_DOCSTRING,
)
class BartForQuestionAnswering(PretrainedBartModel):
......@@ -1254,12 +1258,12 @@ class BartForQuestionAnswering(PretrainedBartModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
......@@ -1332,8 +1336,9 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
@staticmethod
def _init_weight(out: nn.Parameter):
"""Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
The cos features are in the 2nd half of the vector. [dim // 2:]
"""
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector, ``[dim // 2:]``.
"""
n_pos, dim = out.shape
position_enc = np.array(
......
......@@ -580,8 +580,9 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertConfig
......@@ -614,16 +615,16 @@ class BertForPreTrainingOutput(ModelOutput):
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
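A hedged sketch showing where the output fields documented above come from; the checkpoint name and sentence are assumptions for the example.
from transformers import BertTokenizer, BertForPreTraining
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
model = BertForPreTraining.from_pretrained("bert-base-uncased")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True, output_attentions=True,
                return_dict=True)
print(outputs.prediction_logits.shape)        # (batch_size, seq_len, vocab_size)
print(outputs.seq_relationship_logits.shape)  # (batch_size, 2)
print(len(outputs.hidden_states), len(outputs.attentions))  # embeddings + layers, layers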
......@@ -642,14 +643,15 @@ BERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
BERT_INPUTS_DOCSTRING = r"""
......@@ -657,35 +659,33 @@ BERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`.
See :meth:`transformers.PreTrainedTokenizer.encode` and
:meth:`transformers.PreTrainedTokenizer.__call__` for details.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``:
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
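The ids and masks documented in this block are normally produced by the tokenizer; a small, hedged sketch (the checkpoint and sentences are assumptions for the example):
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
enc = tokenizer("How old are you?", "I am six years old.", return_tensors="pt")
print(enc["input_ids"])       # token indices, including [CLS] and [SEP]
print(enc["token_type_ids"])  # 0 for sentence A tokens, 1 for sentence B tokens
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding (none here)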
......@@ -712,17 +712,15 @@ BERT_INPUTS_DOCSTRING = r"""
class BertModel(BertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well
as a decoder, in which case a layer of cross-attention is added between
the self-attention layers, following the architecture described in `Attention is all you need
<https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as an decoder the model needs to be initialized with the
:obj:`is_decoder` argument of the configuration set to :obj:`True`.
To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
argument and :obj:`add_cross_attention` set to :obj:`True`; an
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in `Attention is
all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder`
argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
input to the forward pass.
"""
def __init__(self, config, add_pooling_layer=True):
......@@ -743,9 +741,9 @@ class BertModel(BertPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
......@@ -773,12 +771,11 @@ class BertModel(BertPreTrainedModel):
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
......@@ -855,8 +852,10 @@ class BertModel(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
a `next sentence prediction (classification)` head. """,
"""
Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
......@@ -890,13 +889,12 @@ class BertForPreTraining(BertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``:
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see :obj:`input_ids` docstring). Indices should be in ``[0, 1]``:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
......@@ -1004,20 +1002,18 @@ class BertLMHeadModel(BertPreTrainedModel):
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the left-to-right language modeling loss (next word prediction).
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
Returns:
......@@ -1132,10 +1128,9 @@ class BertForMaskedLM(BertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
......@@ -1229,7 +1224,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
(see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
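A hedged sketch of the ``next_sentence_label`` convention in the list above; the checkpoint and sentences are assumptions, and in newer versions of the library this argument may instead be called ``labels``.
import torch
from transformers import BertTokenizer, BertForNextSentencePrediction
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
prompt = "The sky is blue because of Rayleigh scattering."
follow_up = "Bananas are rich in potassium."
enc = tokenizer(prompt, follow_up, return_tensors="pt")
# 1 = follow_up is a random sentence, 0 = it really continues the prompt.
outputs = model(**enc, next_sentence_label=torch.tensor([1]), return_dict=True)
print(outputs.loss, outputs.logits.shape)  # logits: (1, 2)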
......@@ -1288,8 +1283,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
"""
Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
BERT_START_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
......@@ -1325,9 +1322,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss).
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1372,8 +1368,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
"""
Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
BERT_START_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
......@@ -1408,9 +1406,9 @@ class BertForMultipleChoice(BertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension
of the input tensors. (See :obj:`input_ids` above)
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors (see
:obj:`input_ids` above).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1461,8 +1459,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
"""
Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BERT_START_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
......@@ -1501,8 +1501,8 @@ class BertForTokenClassification(BertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1550,8 +1550,10 @@ class BertForTokenClassification(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BERT_START_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
......@@ -1591,12 +1593,12 @@ class BertForQuestionAnswering(BertPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......