"docs/vscode:/vscode.git/clone" did not exist on "a0b87245f375de9a2ff744867adc200882136171"
Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
@@ -13,10 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities to convert slow tokenizers to their fast tokenizer counterparts.
All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow making our dependency on SentencePiece optional.
"""
from typing import Dict, List, Tuple
@@ -960,13 +960,13 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
Utilities to convert a slow tokenizer instance to a fast tokenizer instance.
Args:
transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
Instance of a slow tokenizer to convert into the backend tokenizer for
[`~tokenization_utils_base.PreTrainedTokenizerFast`].
Return:
An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
[`~tokenization_utils_base.PreTrainedTokenizerFast`]
"""
tokenizer_class_name = transformer_tokenizer.__class__.__name__
...
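As a rough usage sketch of this converter (the checkpoint name and import path are assumptions about typical use, not part of the diff):

```python
from transformers import BertTokenizer, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# Load a slow (pure Python) tokenizer, then convert it into a Rust-backed
# tokenizers.Tokenizer suitable as the backend of a PreTrainedTokenizerFast.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
backend = convert_slow_tokenizer(slow_tokenizer)
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)
```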
@@ -50,8 +50,8 @@ def default_data_collator(features: List[InputDataClass], return_tensors="pt") -
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for examples of how it's useful.
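A minimal sketch of the special `label` handling described above (the feature values are made up for illustration):

```python
from transformers import default_data_collator

features = [
    {"input_ids": [101, 2023, 102], "label": 0},
    {"input_ids": [101, 2005, 102], "label": 1},
]
batch = default_data_collator(features)
# "label" is collected under the "labels" key and stacked into a tensor;
# the same-length "input_ids" are stacked into a (2, 3) tensor.
print(batch["labels"], batch["input_ids"].shape)
```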
@@ -76,8 +76,8 @@ class DefaultDataCollator(DataCollatorMixin):
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for examples of how it's useful.
@@ -86,7 +86,7 @@ class DefaultDataCollator(DataCollatorMixin):
helpful if you need to set a return_tensors value at initialization.
Args:
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -213,26 +213,26 @@ class DataCollatorWithPadding:
Data collator that will dynamically pad the inputs received.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -265,28 +265,28 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -515,33 +515,33 @@ class DataCollatorForSeq2Seq:
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
model ([`PreTrainedModel`]):
The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels* method, use it to
prepare the *decoder_input_ids*
This is useful when using *label_smoothing* to avoid calculating loss twice.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -605,26 +605,27 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
are not all of the same length.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
mlm (`bool`, *optional*, defaults to `True`):
Whether or not to use masked language modeling. If set to `False`, the labels are the same as the
inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
non-masked tokens and the value to predict for the masked token.
mlm_probability (`float`, *optional*, defaults to 0.15):
The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
<Tip>
For best performance, this data collator should be used with a dataset having items that are dictionaries or
BatchEncoding, with the `"special_tokens_mask"` key, as returned by a
[`PreTrainedTokenizer`] or a [`PreTrainedTokenizerFast`] with the
argument `return_special_tokens_mask=True`.
</Tip>"""
tokenizer: PreTrainedTokenizerBase
mlm: bool = True
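A short sketch of the masking behavior and the `special_tokens_mask` tip above (the checkpoint name is illustrative):

```python
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)

# return_special_tokens_mask=True lets the collator skip [CLS]/[SEP] cheaply.
examples = [tokenizer("the cat sat on the mat", return_special_tokens_mask=True)]
batch = collator(examples)
# labels hold -100 at unmasked positions and the original token id at the
# ~15% of positions selected for masking.
print(batch["input_ids"][0], batch["labels"][0])
```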
@@ -845,13 +846,14 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
<Tip>
This collator relies on details of the implementation of subword tokenization by
[`BertTokenizer`], specifically that subword tokens are prefixed with *##*. For tokenizers
that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
[`.DataCollatorForLanguageModeling`].
</Tip>"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
@@ -1227,14 +1229,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
import torch
@@ -1325,14 +1326,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
from random import randint
@@ -1434,14 +1434,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
from random import randint
...
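The numbered procedure above maps onto a loop along these lines (a standalone sketch of the algorithm, not the library's exact implementation):

```python
import torch
from random import randint

def span_mask(max_len: int, max_span_length: int = 5, plm_probability: float = 1 / 6) -> torch.Tensor:
    """Boolean mask built by the step 0-4 procedure described above."""
    masked = torch.zeros(max_len, dtype=torch.bool)
    cur_len = 0  # step 0: number of tokens processed so far
    while cur_len < max_len:
        span_length = randint(1, max_span_length)                         # step 1
        context_length = int(span_length / plm_probability)               # step 2
        start_index = cur_len + randint(0, context_length - span_length)  # step 3
        masked[start_index : start_index + span_length] = True
        cur_len += context_length                                         # step 4
    return masked
```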
@@ -48,20 +48,20 @@ def glue_convert_examples_to_features(
output_mode=None,
):
"""
Loads a data file into a list of `InputFeatures`
Args:
examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
output_mode: String indicating the output mode. Either `regression` or `classification`
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
`InputFeatures` which can be fed to the model.
"""
warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
@@ -84,7 +84,7 @@ if is_tf_available():
) -> tf.data.Dataset:
"""
Returns:
A `tf.data.Dataset` containing the task-specific features.
"""
processor = glue_processors[task]()
...
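A hedged usage sketch of the function above (the processor, checkpoint and data directory are illustrative):

```python
from transformers import AutoTokenizer, glue_convert_examples_to_features
from transformers.data.processors.glue import MrpcProcessor

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
processor = MrpcProcessor()
examples = processor.get_dev_examples("path/to/MRPC")  # hypothetical data dir

# Returns a list of InputFeatures ready to be fed to the model.
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
```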
@@ -332,8 +332,8 @@ def squad_convert_examples_to_features(
model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of [`~data.processors.squad.SquadExample`]
tokenizer: an instance of a child of [`PreTrainedTokenizer`]
max_seq_length: The maximum sequence length of the inputs.
doc_stride: The stride used when the context is too large and is split across several features.
max_query_length: The maximum length of the query.
@@ -345,22 +345,23 @@ def squad_convert_examples_to_features(
Returns:
list of [`~data.processors.squad.SquadFeatures`]
Example:
```python
processor = SquadV2Processor()
examples = processor.get_dev_examples(data_dir)

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    doc_stride=args.doc_stride,
    max_query_length=args.max_query_length,
    is_training=not evaluate,
)
```"""
# Defining helper methods
features = []
@@ -574,23 +575,24 @@ class SquadProcessor(DataProcessor):
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
Args:
dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
evaluate: Boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
Examples:
```python
>>> import tensorflow_datasets as tfds

>>> dataset = tfds.load("squad")

>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
```"""
if evaluate:
dataset = dataset["validation"]
@@ -759,8 +761,8 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
[`~data.processors.squad.SquadExample`] using the
[`~data.processors.squad.squad_convert_examples_to_features`] method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
...
@@ -60,7 +60,7 @@ class InputFeatures:
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
@@ -92,15 +92,15 @@ class DataProcessor:
raise NotImplementedError()
def get_train_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the test set."""
raise NotImplementedError()
def get_labels(self):
@@ -240,21 +240,21 @@ class SingleSentenceClassificationProcessor(DataProcessor):
return_tensors=None,
):
"""
Convert examples into a list of `InputFeatures`
Args:
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for
actual values)
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
`InputFeatures` which can be fed to the model.
"""
if max_length is None:
...
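To make the `mask_padding_with_zero` convention concrete, here is what it implies for a length-3 sequence padded to `max_length=5` (the values are illustrative):

```python
# pad_on_left=False, so padding goes on the right.
input_ids = [101, 7592, 102, 0, 0]

# mask_padding_with_zero=True: 1 marks actual values, 0 marks padding.
attention_mask = [1, 1, 1, 0, 0]

# mask_padding_with_zero=False inverts it: 1 for padded values, 0 for actual ones.
attention_mask_inverted = [0, 0, 0, 1, 1]
```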
@@ -28,7 +28,7 @@ logger = logging.get_logger(__name__)
class DebugUnderflowOverflow:
"""
This debug class helps detect and understand where the model starts getting very large or very small, and more
importantly `nan` or `inf` weight and activation elements.
There are 2 working modes:
@@ -37,69 +37,77 @@ class DebugUnderflowOverflow:
Mode 1: Underflow/overflow detection
To activate the underflow/overflow detection, initialize the object with the model:
```python
debug_overflow = DebugUnderflowOverflow(model)
```
then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
event, each frame reporting
1. the fully qualified module name plus the class name whose `forward` was run
2. the absolute min and max value of all elements for each module weights, and the inputs and output
For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16 mixed precision:
```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min  abs max  metadata
[...]
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
```
You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overflow.
As you can see, it's the previous frames that we need to look into when the numbers start getting very large for
fp16 numbers.
The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
By default the last 21 frames are printed. You can change the default to adjust for your needs. For example:
```python
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
```
To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
take hours to complete, first run it with normal tracing enabled for one or a few batches as explained in the next
section.
Mode 2. Specific batch absolute min/max tracing without detection
The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
```
And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
@@ -109,28 +117,29 @@ class DebugUnderflowOverflow:
Early stopping:
You can also specify the batch number after which to stop the training, with:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
```
This feature is mainly useful in the tracing mode, but you can use it for any mode.
**Performance**:
As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the
training down. Therefore remember to turn it off once the debugging needs have been met.
Args:
model (`nn.Module`):
The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
abort_after_batch_num (`int`, *optional*):
Whether to abort after a certain batch number has finished
"""
def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
@@ -287,7 +296,7 @@ def get_abs_min_max(var, ctx):
def detect_overflow(var, ctx):
"""
Report whether the tensor contains any `nan` or `inf` entries.
This is useful for detecting overflows/underflows and best to call right after the function that did some math that
modified the tensor in question.
@@ -300,7 +309,7 @@ def detect_overflow(var, ctx):
ctx: the message to print as a context
Return:
`True` if `inf` or `nan` was detected, `False` otherwise
"""
detected = False
if torch.isnan(var).any().item():
...
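Typical use is a one-off probe right after a suspect operation, e.g. (a minimal sketch):

```python
import torch

x = torch.full((2, 2), 1e38) * 10  # overflows float32 to inf
# Reports the problem for the given ctx and returns True when any nan/inf entry is found.
if detect_overflow(x, "after scaling"):
    print("inf detected right after the multiplication")
```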
@@ -41,16 +41,16 @@ class HfDeepSpeedConfig:
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object are not available (e.g. `from_pretrained` and `_get_resized_embeddings`).
Therefore it's important that this object remains alive while the program is still running.
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to
sync the configuration with values of [`TrainingArguments`] by replacing special placeholder
values: `"auto"`. Without this special logic the DeepSpeed configuration is not modified in any way.
Args:
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
@@ -104,7 +104,7 @@ class HfDeepSpeedConfig:
def get_value(self, ds_key_long, default=None):
"""
Returns the set value or `default` if no value is set
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
@@ -115,7 +115,7 @@ class HfDeepSpeedConfig:
"""
Deletes a sub-section of the config file if it's found.
Unless `must_exist` is `True` the section doesn't have to exist.
"""
config = self.config
@@ -136,8 +136,7 @@ class HfDeepSpeedConfig:
def is_true(self, ds_key_long):
"""
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the
very specific question of whether the value is set to `True` (and it's not set to `False` or
isn't set).
"""
@@ -146,8 +145,7 @@ class HfDeepSpeedConfig:
def is_false(self, ds_key_long):
"""
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the
very specific question of whether the value is set to `False` (and it's not set to `True` or
isn't set).
"""
value = self.get_value(ds_key_long)
@@ -165,7 +163,7 @@ class HfDeepSpeedConfig:
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has
the same lifespan as the latter.
"""
@@ -181,11 +179,11 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
A utility method that massages the config file and can optionally verify that the values match.
1. Replace "auto" values with the `TrainingArguments` value.
2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
@@ -207,7 +205,7 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
def trainer_config_process(self, args):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
@@ -373,7 +371,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
...
@@ -40,11 +40,11 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
This is a general feature extraction class for speech recognition.
Args:
feature_size (`int`):
The feature dimension of the extracted features.
sampling_rate (`int`):
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
padding_value (`float`):
The value that is used to fill the padding values / vectors.
"""
...@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin): ...@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
max sequence length in the batch. max sequence length in the batch.
Padding side (left/right) padding values are defined at the feature extractor level (with Padding side (left/right) padding values are defined at the feature extractor level (with
``self.padding_side``, ``self.padding_value``) `self.padding_side`, `self.padding_value`)
.. note:: <Tip>
If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
the result will use the same type unless you provide a different tensor type with ``return_tensors``. In the result will use the same type unless you provide a different tensor type with `return_tensors`. In
the case of PyTorch tensors, you will lose the specific device of your tensors however. the case of PyTorch tensors, you will lose the specific device of your tensors however.
</Tip>
Args: Args:
processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`): processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`):
Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str, Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of input values / vectors (list of [`BatchFeature`],
List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`, *Dict[str, List[List[float]]]* or *List[Dict[str, List[float]]]*) so you can use this method during
`Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
preprocessing as well as in a PyTorch Dataloader collate function. preprocessing as well as in a PyTorch Dataloader collate function.
Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
tensors), see the note above for the return type.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
    Select a strategy to pad the returned sequences (according to the model's padding side and padding
    index) among:

    - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
      single sequence is provided).
    - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
      maximum acceptable input length for the model if that argument is not provided.
    - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
      different lengths).
max_length (`int`, *optional*):
    Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
    Activates truncation to cut input sequences longer than `max_length` to `max_length`.
pad_to_multiple_of (`int`, *optional*):
    If set will pad the sequence to a multiple of the provided value.

    This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
    >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
    Whether to return the attention mask. If left to the default, will return the attention mask according
    to the specific feature_extractor's default.

    [What are attention masks?](../glossary#attention-mask)
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
    If set, will return tensors instead of list of python integers. Acceptable values are:

    - `'tf'`: Return TensorFlow `tf.constant` objects.
    - `'pt'`: Return PyTorch `torch.Tensor` objects.
    - `'np'`: Return Numpy `np.ndarray` objects.
"""
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch DataLoader
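For illustration (not part of the diff), a minimal sketch of the `pad` call documented above; the model id, array lengths, and printed shape are placeholders:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Two raw waveforms of different lengths (placeholder data).
features = {"input_values": [np.ones(1200, dtype=np.float32), np.ones(800, dtype=np.float32)]}

# Pad to the longest sequence, round up to a multiple of 128, return PyTorch tensors.
batch = feature_extractor.pad(
    features,
    padding="longest",
    pad_to_multiple_of=128,
    return_attention_mask=True,
    return_tensors="pt",
)
print(batch["input_values"].shape)  # torch.Size([2, 1280])
```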
@@ -54,16 +54,16 @@ PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"]  # noqa: F821
class BatchFeature(UserDict):
r"""
Holds the output of the [`~SequenceFeatureExtractor.pad`] and feature extractor specific `__call__` methods.

This class is derived from a python dictionary and can be used as a dictionary.

Args:
data (`dict`):
    Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values',
    'attention_mask', etc.).
tensor_type (`Union[None, str, TensorType]`, *optional*):
    You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
    initialization.
"""
@@ -74,7 +74,7 @@ class BatchFeature(UserDict):
def __getitem__(self, item: str) -> Union[Any]:
"""
If the key is a string, returns the value of the dict associated with `key` ('input_values',
'attention_mask', etc.).
"""
if isinstance(item, str):
@@ -112,9 +112,9 @@ class BatchFeature(UserDict):
Convert the inner content to tensors.

Args:
tensor_type (`str` or [`~file_utils.TensorType`], *optional*):
    The type of tensors to use. If `str`, should be one of the values of the enum
    [`~file_utils.TensorType`]. If `None`, no modification is done.
"""
if tensor_type is None:
return self
@@ -176,13 +176,13 @@ class BatchFeature(UserDict):
# Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature
def to(self, device: Union[str, "torch.device"]) -> "BatchFeature":
"""
Send all values to device by calling `v.to(device)` (PyTorch only).

Args:
device (`str` or `torch.device`): The device to put the tensors on.

Returns:
[`BatchFeature`]: The same instance after modification.
"""
# This check catches things like APEX blindly calling "to" on all inputs to a module
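For illustration (not part of the diff), a minimal sketch of `BatchFeature` as a dict plus the two helpers above; the values are placeholders and PyTorch is assumed installed:

```python
import torch
from transformers import BatchFeature

batch = BatchFeature({"input_values": [[0.1, 0.2, 0.3]], "attention_mask": [[1, 1, 1]]})
batch = batch.convert_to_tensors("pt")  # lists become torch.Tensor objects
print(batch["input_values"].dtype)      # dict-style access, prints torch.float32

if torch.cuda.is_available():
    batch = batch.to("cuda")            # sends every tensor to the device (PyTorch only)
```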
@@ -216,83 +216,84 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> PreTrainedFeatureExtractor:
r"""
Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature
extractor, *e.g.* a derived class of [`SequenceFeatureExtractor`].

Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
    This can be either:

    - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
      huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
      namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
    - a path to a *directory* containing a feature extractor file saved using the
      [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
      `./my_model_directory/`.
    - a path or url to a saved feature extractor JSON *file*, e.g.,
      `./my_model_directory/preprocessor_config.json`.
cache_dir (`str` or `os.PathLike`, *optional*):
    Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
    standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
    Whether or not to force to (re-)download the feature extractor files and override the cached versions
    if they exist.
resume_download (`bool`, *optional*, defaults to `False`):
    Whether or not to delete incompletely received files. Attempts to resume the download if such a file
    exists.
proxies (`Dict[str, str]`, *optional*):
    A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
    'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
    The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
    generated when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
    The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
    git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
    identifier allowed by git.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
    If `False`, then this function returns just the final feature extractor object. If `True`,
    then this function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a
    dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
    part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
kwargs (`Dict[str, Any]`, *optional*):
    The values in kwargs of any keys which are feature extractor attributes will be used to override the
    loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
    controlled by the `return_unused_kwargs` keyword parameter.

<Tip>

Passing `use_auth_token=True` is required when you want to use a private model.

</Tip>

Returns:
A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`].

Examples:

```python
# We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor`,
# so let's show the examples on a derived class: `Wav2Vec2FeatureExtractor`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')  # Download feature_extraction_config from huggingface.co and cache.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
assert feature_extractor.return_attention_mask is False
feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
                                                                            foo=False, return_unused_kwargs=True)
assert feature_extractor.return_attention_mask is False
assert unused_kwargs == {'foo': False}
```"""
feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(feature_extractor_dict, **kwargs)
def save_pretrained(self, save_directory: Union[str, os.PathLike]):
"""
Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method.

Args:
save_directory (`str` or `os.PathLike`):
    Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
"""
if os.path.isfile(save_directory):
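For illustration (not part of the diff), the save/reload round trip these two methods describe; the path and model id are placeholders:

```python
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor.save_pretrained("./my_model_directory/")  # writes preprocessor_config.json

# Later, reload from the directory created above.
reloaded = Wav2Vec2FeatureExtractor.from_pretrained("./my_model_directory/")
assert reloaded.to_dict() == feature_extractor.to_dict()
```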
@@ -309,16 +310,16 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`.

Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
    The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.

Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
@@ -397,19 +398,19 @@ class FeatureExtractionMixin:
@classmethod
def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
"""
Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of
parameters.

Args:
feature_extractor_dict (`Dict[str, Any]`):
    Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
    retrieved from a pretrained checkpoint by leveraging the
    [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
kwargs (`Dict[str, Any]`):
    Additional parameters from which to initialize the feature extractor object.

Returns:
[`~feature_extraction_utils.FeatureExtractionMixin`]: The feature extractor object instantiated from those
parameters.
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
@@ -436,7 +437,7 @@ class FeatureExtractionMixin:
Serializes this instance to a Python dictionary.

Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
@@ -446,15 +447,15 @@ class FeatureExtractionMixin:
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor:
"""
Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path
to a JSON file of parameters.

Args:
json_file (`str` or `os.PathLike`):
    Path to the JSON file containing the parameters.

Returns:
A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature_extractor
object instantiated from that JSON file.
"""
with open(json_file, "r", encoding="utf-8") as reader:
@@ -467,7 +468,7 @@ class FeatureExtractionMixin:
Serializes this instance to a JSON string.

Returns:
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
"""
dictionary = self.to_dict()
@@ -483,7 +484,7 @@ class FeatureExtractionMixin:
Save this instance to a JSON file.

Args:
json_file_path (`str` or `os.PathLike`):
    Path to the JSON file in which this feature_extractor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
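For illustration (not part of the diff), the dict and JSON round trips offered by the serialization helpers above; the model id and file name are placeholders:

```python
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

config = fe.to_dict()  # plain Python dict, includes "feature_extractor_type"
fe_from_dict = Wav2Vec2FeatureExtractor.from_dict(config)

fe.to_json_file("preprocessor_config.json")
fe_from_file = Wav2Vec2FeatureExtractor.from_json_file("preprocessor_config.json")

print(fe.to_json_string()[:80])  # the same attributes as a JSON string
```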
@@ -25,70 +25,70 @@ from .file_utils import add_start_docstrings
PROCESS_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
    Current scores of the top `2 * num_beams` non-finished beam hypotheses.
next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
    `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
    Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.

Return:
`UserDict`: A dictionary composed of the fields as defined above:

- **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of
  all non-finished beams.
- **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be
  added to the non-finished beam_hypotheses.
- **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
  indicating to which beam the next tokens shall be added.
"""
FINALIZE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The final scores of all non-finished beams.
final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The last tokens to be added to the non-finished beam_hypotheses.
final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The beam indices indicating to which beam the `final_beam_tokens` shall be added.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.

Return:
`torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
batches finished early due to the `eos_token_id`.
"""
class BeamScorer(ABC):
"""
Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
[`~PreTrainedModel.beam_sample`].
"""
@abstractmethod
@@ -119,36 +119,34 @@ class BeamScorer(ABC):
class BeamSearchScorer(BeamScorer):
r"""
[`BeamScorer`] implementing standard beam search decoding.

Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).

Reference for the diverse beam search algorithm and implementation: [Ashwin Kalyan's DBS implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua).

Args:
batch_size (`int`):
    Batch size of `input_ids` for which standard beam search decoding is run in parallel.
max_length (`int`):
    The maximum length of the sequence to be generated.
num_beams (`int`):
    Number of beams for beam search.
device (`torch.device`):
    Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of
    `BeamSearchScorer` will be allocated.
length_penalty (`float`, *optional*, defaults to 1.0):
    Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
    model to generate shorter sequences, or to a value > 1.0 in order to encourage the model to produce
    longer sequences.
do_early_stopping (`bool`, *optional*, defaults to `False`):
    Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
    The number of beam hypotheses that shall be returned upon calling
    [`~BeamSearchScorer.finalize`].
num_beam_groups (`int`):
    Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
    beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
"""
def __init__(
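For illustration (not part of the diff): a `BeamSearchScorer` is normally built for you by `generate` when `num_beams > 1`, so a sketch through that entry point is the safest usage example; the model id and generation parameters are placeholders:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids

# generate() constructs a BeamSearchScorer internally from num_beams and related options.
outputs = model.generate(input_ids, num_beams=4, num_return_sequences=2, early_stopping=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```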
@@ -29,22 +29,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
    Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
    using beam search or log softmax for each vocabulary token when using beam search.
kwargs:
    Additional logits processor specific kwargs.

Return:
`jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -73,10 +73,10 @@ class FlaxLogitsWarper(ABC):
class FlaxLogitsProcessorList(list):
"""
This class can be used to create a list of [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to subsequently
process a `scores` input tensor. This class inherits from list and adds a specific `__call__` method to apply
each [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -97,10 +97,10 @@ class FlaxLogitsProcessorList(list):
class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] for temperature (exponential scaling of the output probability distribution).

Args:
temperature (`float`):
    The value used to modulate the logits distribution.
"""
@@ -117,16 +117,16 @@ class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
class FlaxTopPLogitsWarper(FlaxLogitsWarper):
"""
[`FlaxLogitsWarper`] that performs top-p filtering, i.e. restricting to the smallest set of most probable
tokens whose probabilities add up to `top_p` or higher.

Args:
top_p (`float`):
    If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
    kept for generation.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -159,14 +159,14 @@ class FlaxTopPLogitsWarper(FlaxLogitsWarper):
class FlaxTopKLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.

Args:
top_k (`int`):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -195,10 +195,10 @@ class FlaxTopKLogitsWarper(FlaxLogitsWarper):
class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.

Args:
bos_token_id (`int`):
    The id of the token to force as the first generated token.
"""
@@ -219,14 +219,14 @@ class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is
reached.

Args:
max_length (`int`):
    The maximum length of the sequence to be generated.
eos_token_id (`int`):
    The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
@@ -247,13 +247,13 @@ class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.

Args:
min_length (`int`):
    The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
    The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
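For illustration (not part of the diff), a minimal sketch of chaining the Flax warpers above through `FlaxLogitsProcessorList`; the import path, shapes, and vocabulary size are assumptions based on this file:

```python
import jax.numpy as jnp
from transformers.generation_flax_logits_process import (
    FlaxLogitsProcessorList,
    FlaxTemperatureLogitsWarper,
    FlaxTopKLogitsWarper,
)

warpers = FlaxLogitsProcessorList(
    [FlaxTemperatureLogitsWarper(0.7), FlaxTopKLogitsWarper(top_k=50)]
)

input_ids = jnp.ones((1, 4), dtype="i4")        # (batch_size, sequence_length)
scores = jnp.zeros((1, 32000))                  # (batch_size, vocab_size), placeholder logits
scores = warpers(input_ids, scores, cur_len=4)  # Flax processors also take cur_len
```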
@@ -48,7 +48,7 @@ class FlaxGreedySearchOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
"""
@@ -62,7 +62,7 @@ class FlaxSampleOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
"""
@@ -76,9 +76,9 @@ class FlaxBeamSearchOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
scores (`jnp.ndarray` of shape `(batch_size,)`):
    The scores (log probabilities) of the generated sequences.
"""
@@ -119,7 +119,7 @@ class BeamSearchState:
class FlaxGenerationMixin:
"""
A class containing all of the functions supporting generation, to be used as a mixin in
[`FlaxPreTrainedModel`].
"""
@staticmethod
@@ -149,7 +149,7 @@ class FlaxGenerationMixin:
"""
This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom
beam search behavior. Note that the only model that overwrites this method is [`FlaxMarianMTModel`].
"""
return logits
@@ -181,61 +181,62 @@ class FlaxGenerationMixin:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding
and multinomial sampling.

Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same name
inside the [`PretrainedConfig`] of the model. The default values indicated are the default values of those
config attributes.

Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).

Parameters:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
    The sequence used as a prompt for the generation.
max_length (`int`, *optional*, defaults to 20):
    The maximum length of the sequence to be generated.
do_sample (`bool`, *optional*, defaults to `False`):
    Whether or not to use sampling; use greedy decoding otherwise.
temperature (`float`, *optional*, defaults to 1.0):
    The value used to modulate the next token probabilities.
top_k (`int`, *optional*, defaults to 50):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`, *optional*, defaults to 1.0):
    If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
    higher are kept for generation.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
bos_token_id (`int`, *optional*):
    The id of the *beginning-of-sequence* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.
num_beams (`int`, *optional*, defaults to 1):
    Number of beams for beam search. 1 means no beam search.
decoder_start_token_id (`int`, *optional*):
    If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
trace (`bool`, *optional*, defaults to `True`):
    Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to
    a considerably slower runtime.
params (`Dict[str, jnp.ndarray]`, *optional*):
    Optionally the model parameters can be passed. Can be useful for parallelized generation.
model_kwargs:
    Additional model specific kwargs will be forwarded to the `forward` function of the model.

Return:
[`~file_utils.ModelOutput`].

Examples:

```python
>>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
>>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
>>> input_context = "The dog"
>>> # encode input context
>>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
>>> # generate candidates using sampling
>>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
>>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
```"""
# set init values
max_length = max_length if max_length is not None else self.config.max_length
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
@@ -326,8 +327,8 @@ class FlaxGenerationMixin:
self, top_k: int = None, top_p: float = None, temperature: float = None
) -> FlaxLogitsProcessorList:
"""
This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
[`FlaxLogitsWarper`] instances used for multinomial sampling.
"""
# init warp parameters
@@ -358,8 +359,8 @@ class FlaxGenerationMixin:
forced_eos_token_id: int,
) -> FlaxLogitsProcessorList:
"""
This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
[`FlaxLogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = FlaxLogitsProcessorList()
@@ -30,22 +30,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
    Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
    using beam search or log softmax for each vocabulary token when using beam search.
kwargs:
    Additional logits processor specific kwargs.

Return:
`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -74,10 +74,10 @@ class LogitsWarper(ABC):
class LogitsProcessorList(list):
"""
This class can be used to create a list of [`LogitsProcessor`] or [`LogitsWarper`] to subsequently process a
`scores` input tensor. This class inherits from list and adds a specific `__call__` method to apply each
[`LogitsProcessor`] or [`LogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -98,13 +98,13 @@ class LogitsProcessorList(list):
class MinLengthLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.

Args:
min_length (`int`):
    The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
    The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
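For illustration (not part of the diff), a minimal sketch of `MinLengthLogitsProcessor` in isolation; the ids and sizes are placeholders:

```python
import torch
from transformers import MinLengthLogitsProcessor

processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=2)

input_ids = torch.ones((1, 5), dtype=torch.long)  # only 5 tokens generated so far
scores = torch.zeros((1, 100))                    # (batch_size, vocab_size), placeholder logits

scores = processor(input_ids, scores)
print(scores[0, 2])  # EOS score is -inf while the sequence is below min_length
```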
@@ -126,10 +126,10 @@ class MinLengthLogitsProcessor(LogitsProcessor):
class TemperatureLogitsWarper(LogitsWarper):
r"""
[`LogitsWarper`] for temperature (exponential scaling of the output probability distribution).

Args:
temperature (`float`):
    The value used to modulate the logits distribution.
"""
@@ -146,12 +146,11 @@ class TemperatureLogitsWarper(LogitsWarper):
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.

Args:
repetition_penalty (`float`):
    The parameter for repetition penalty. 1.0 means no penalty. See
    [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
"""
def __init__(self, penalty: float):
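For illustration (not part of the diff), a sketch combining the warper and processor above in a `LogitsProcessorList`; the values are placeholders (note that the `__init__` above takes `penalty`):

```python
import torch
from transformers import (
    LogitsProcessorList,
    RepetitionPenaltyLogitsProcessor,
    TemperatureLogitsWarper,
)

processors = LogitsProcessorList(
    [RepetitionPenaltyLogitsProcessor(penalty=1.2), TemperatureLogitsWarper(0.7)]
)

input_ids = torch.tensor([[5, 7, 7]])   # token 7 already repeated
scores = torch.randn(1, 100)            # (batch_size, vocab_size), placeholder logits
scores = processors(input_ids, scores)  # penalized, then temperature-scaled
```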
@@ -172,16 +171,16 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
class TopPLogitsWarper(LogitsWarper):
"""
[`LogitsWarper`] that performs top-p filtering, i.e. restricting to the smallest set of most probable tokens
whose probabilities add up to `top_p` or higher.

Args:
top_p (`float`):
    If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
    kept for generation.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -215,14 +214,14 @@ class TopPLogitsWarper(LogitsWarper):
class TopKLogitsWarper(LogitsWarper):
r"""
[`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.

Args:
top_k (`int`):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
...@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens( ...@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens(
class NoRepeatNGramLogitsProcessor(LogitsProcessor): class NoRepeatNGramLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq [`LogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
<https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__.
Args: Args:
ngram_size (:obj:`int`): ngram_size (`int`):
All ngrams of size :obj:`ngram_size` can only occur once. All ngrams of size `ngram_size` can only occur once.
""" """
def __init__(self, ngram_size: int): def __init__(self, ngram_size: int):
...@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor): ...@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids. [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids.
See `ParlAI <https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350>`__. See [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
Args: Args:
encoder_ngram_size (:obj:`int`): encoder_ngram_size (`int`):
All ngrams of size :obj:`encoder_ngram_size` can only occur within the encoder input ids. All ngrams of size `encoder_ngram_size` can only occur within the encoder input ids.
encoder_input_ids (:obj:`int`): encoder_input_ids (`int`):
The encoder_input_ids that should not be repeated within the decoder ids. The encoder_input_ids that should not be repeated within the decoder ids.
""" """
...@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): ...@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
class NoBadWordsLogitsProcessor(LogitsProcessor): class NoBadWordsLogitsProcessor(LogitsProcessor):
""" """
:class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled. [`LogitsProcessor`] that enforces that specified sequences will never be sampled.
Args: Args:
bad_words_ids (:obj:`List[List[int]]`): bad_words_ids (`List[List[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, use :obj:`tokenizer(bad_word, that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
add_prefix_space=True).input_ids`. eos_token_id (`int`):
eos_token_id (:obj:`int`): The id of the *end-of-sequence* token.
The id of the `end-of-sequence` token.
""" """
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
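Following the docstring's recipe, banned sequences are usually passed through `generate` (the banned words below are arbitrary examples; model, tokenizer, and inputs are reused from the sampling example above):

```python
# Token ids for the words we never want to see in the output.
bad_words_ids = tokenizer(["vacation", "coworker"], add_prefix_space=True).input_ids
outputs = model.generate(**inputs, max_length=30, bad_words_ids=bad_words_ids)
```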
...@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor): ...@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
class PrefixConstrainedLogitsProcessor(LogitsProcessor): class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more constrained generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more
information. information.
Args: Args:
prefix_allowed_tokens_fn (:obj:`Callable[[int, torch.Tensor], List[int]]`): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
This function constrains the beam search to allowed tokens only at each step. This function takes 2 This function constrains the beam search to allowed tokens only at each step. This function takes 2
arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed
tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and
the batch ID :obj:`batch_id`. the batch ID `batch_id`.
""" """
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int): def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
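A toy `prefix_allowed_tokens_fn` (a hypothetical constraint, just for illustration) that restricts every step to a small fixed vocabulary, reusing the objects from the sampling example above:

```python
# Only ever allow tokens from this small set (a toy constraint).
allowed_ids = tokenizer(" the cat sat", add_special_tokens=False).input_ids

def prefix_allowed_tokens_fn(batch_id, input_ids):
    # Called at every step with the ids generated so far for this beam.
    return allowed_ids

outputs = model.generate(
    **inputs, max_length=20, num_beams=2,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)
```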
...@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor): ...@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
class HammingDiversityLogitsProcessor(LogitsProcessor): class HammingDiversityLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only
effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse
Solutions from Neural Sequence Models <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
Args: Args:
diversity_penalty (:obj:`float`): diversity_penalty (`float`):
This value is subtracted from a beam's score if it generates the same token as any beam from another group at a This value is subtracted from a beam's score if it generates the same token as any beam from another group at a
particular time step. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled. particular time step. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
num_beams (:obj:`int`): num_beams (`int`):
Number of beams used for group beam search. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
more details. more details.
num_beam_groups (:obj:`int`): num_beam_groups (`int`):
Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
""" """
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
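This processor is enabled automatically when `generate` is called with group beam search arguments; a sketch, again reusing the model and inputs from the sampling example above:

```python
# num_beams must be divisible by num_beam_groups; a diversity_penalty > 0
# activates the Hamming diversity term between groups.
outputs = model.generate(
    **inputs, max_length=30,
    num_beams=6, num_beam_groups=3, diversity_penalty=1.0,
)
```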
...@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor): ...@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
class ForcedBOSTokenLogitsProcessor(LogitsProcessor): class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token. [`LogitsProcessor`] that enforces the specified token as the first generated token.
Args: Args:
bos_token_id (:obj:`int`): bos_token_id (`int`):
The id of the token to force as the first generated token. The id of the token to force as the first generated token.
""" """
...@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor): ...@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
class ForcedEOSTokenLogitsProcessor(LogitsProcessor): class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when [`LogitsProcessor`] that enforces the specified token as the last generated token when
:obj:`max_length` is reached. `max_length` is reached.
Args: Args:
max_length (:obj:`int`): max_length (`int`):
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
eos_token_id (:obj:`int`): eos_token_id (`int`):
The id of the token to force as the last generated token when :obj:`max_length` is reached. The id of the token to force as the last generated token when `max_length` is reached.
""" """
def __init__(self, max_length: int, eos_token_id: int): def __init__(self, max_length: int, eos_token_id: int):
...@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor): ...@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
class InfNanRemoveLogitsProcessor(LogitsProcessor): class InfNanRemoveLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to prevent the generation [`LogitsProcessor`] that removes all `nan` and `inf` values to prevent the generation
method from failing. Note that this logits processor should only be used if necessary, since it can slow down the method from failing. Note that this logits processor should only be used if necessary, since it can slow down the
generation method. generation method.
""" """
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
......
...@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings ...@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings
STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Args: Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See Indices can be obtained using [`BertTokenizer`]. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details. details.
`What are input IDs? <../glossary.html#input-ids>`__ [What are input IDs?](../glossary#input-ids)
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax. or scores for each vocabulary token after SoftMax.
kwargs: kwargs:
Additional stopping criteria specific kwargs. Additional stopping criteria specific kwargs.
Return: Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop. `bool`. `False` indicates we should continue, `True` indicates we should stop.
""" """
...@@ -41,11 +41,11 @@ class StoppingCriteria(ABC): ...@@ -41,11 +41,11 @@ class StoppingCriteria(ABC):
class MaxLengthCriteria(StoppingCriteria): class MaxLengthCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`. This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`.
Keep in mind that, for decoder-only transformers, this will include the initial prompt tokens. Keep in mind that, for decoder-only transformers, this will include the initial prompt tokens.
Args: Args:
max_length (:obj:`int`): max_length (`int`):
The maximum length that the output sequence can have in number of tokens. The maximum length that the output sequence can have in number of tokens.
""" """
...@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria): ...@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria):
class MaxNewTokensCriteria(StoppingCriteria): class MaxNewTokensCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`. This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`.
Keep in mind that, for decoder-only transformers, this will **not** include the initial prompt tokens. This is Keep in mind that, for decoder-only transformers, this will **not** include the initial prompt tokens. This is
very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens. very close to `MaxLengthCriteria` but ignores the number of initial tokens.
Args: Args:
start_length (:obj:`int`): start_length (`int`):
The number of initial tokens. The number of initial tokens.
max_new_tokens (:obj:`int`): max_new_tokens (`int`):
The maximum number of tokens to generate. The maximum number of tokens to generate.
""" """
...@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria): ...@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
time will start being counted when you initialize this class. You can override this by passing an time will start being counted when you initialize this class. You can override this by passing an
:obj:`initial_time`. `initial_time`.
Args: Args:
max_time (:obj:`float`): max_time (`float`):
The maximum allowed time in seconds for the generation. The maximum allowed time in seconds for the generation.
initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`): initial_time (`float`, *optional*, defaults to `time.time()`):
The start of the generation allowed time. The start of the generation allowed time.
""" """
......
...@@ -41,14 +41,14 @@ def is_torch_tensor(obj): ...@@ -41,14 +41,14 @@ def is_torch_tensor(obj):
def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
""" """
Loads :obj:`image` to a PIL Image. Loads `image` to a PIL Image.
Args: Args:
image (:obj:`str` or :obj:`PIL.Image.Image`): image (`str` or `PIL.Image.Image`):
The image to convert to the PIL Image format. The image to convert to the PIL Image format.
Returns: Returns:
:obj:`PIL.Image.Image`: A PIL Image. `PIL.Image.Image`: A PIL Image.
""" """
if isinstance(image, str): if isinstance(image, str):
if image.startswith("http://") or image.startswith("https://"): if image.startswith("http://") or image.startswith("https://"):
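A short usage sketch (the URL is a placeholder; fetching it assumes network access, and a local path or an existing PIL image works as well):

```python
from transformers.image_utils import load_image

img = load_image("https://example.com/some_image.png")  # URL, local path, or PIL image
print(img.size)
```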
...@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin: ...@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin:
def to_pil_image(self, image, rescale=None): def to_pil_image(self, image, rescale=None):
""" """
Converts :obj:`image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
axis if needed. axis if needed.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`numpy.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
The image to convert to the PIL Image format. The image to convert to the PIL Image format.
rescale (:obj:`bool`, `optional`): rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
default to :obj:`True` if the image type is a floating type, :obj:`False` otherwise. default to `True` if the image type is a floating type, `False` otherwise.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin: ...@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin:
def to_numpy_array(self, image, rescale=None, channel_first=True): def to_numpy_array(self, image, rescale=None, channel_first=True):
""" """
Converts :obj:`image` to a numpy array. Optionally rescales it and puts the channel dimension as the first Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
dimension. dimension.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to convert to a NumPy array. The image to convert to a NumPy array.
rescale (:obj:`bool`, `optional`): rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
default to :obj:`True` if the image is a PIL Image or an array/tensor of integers, :obj:`False` default to `True` if the image is a PIL Image or an array/tensor of integers, `False`
otherwise. otherwise.
channel_first (:obj:`bool`, `optional`, defaults to :obj:`True`): channel_first (`bool`, *optional*, defaults to `True`):
Whether or not to permute the dimensions of the image to put the channel dimension first. Whether or not to permute the dimensions of the image to put the channel dimension first.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin: ...@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin:
def normalize(self, image, mean, std): def normalize(self, image, mean, std):
""" """
Normalizes :obj:`image` with :obj:`mean` and :obj:`std`. Note that this will trigger a conversion of Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of
:obj:`image` to a NumPy array if it's a PIL Image. `image` to a NumPy array if it's a PIL Image.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize. The image to normalize.
mean (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`): mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
The mean (per channel) to use for normalization. The mean (per channel) to use for normalization.
std (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`): std (`List[float]` or `np.ndarray` or `torch.Tensor`):
The standard deviation (per channel) to use for normalization. The standard deviation (per channel) to use for normalization.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
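As a worked example of the arithmetic (plain NumPy, channel-first layout; the mean/std values are the common ImageNet statistics, used here only for illustration):

```python
import numpy as np

image = np.full((3, 2, 2), 0.5, dtype=np.float32)          # channel-first image
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)   # per-channel mean
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)    # per-channel std

normalized = (image - mean[:, None, None]) / std[:, None, None]
print(normalized[:, 0, 0])  # first channel: (0.5 - 0.485) / 0.229 ≈ 0.066
```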
...@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin: ...@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin:
def resize(self, image, size, resample=PIL.Image.BILINEAR): def resize(self, image, size, resample=PIL.Image.BILINEAR):
""" """
Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image. Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize. The image to resize.
size (:obj:`int` or :obj:`Tuple[int, int]`): size (`int` or `Tuple[int, int]`):
The size to use for resizing the image. The size to use for resizing the image.
resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`): resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
The filter to use for resampling. The filter to use for resampling.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin: ...@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin:
def center_crop(self, image, size): def center_crop(self, image, size):
""" """
Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
the given size, it will be padded (so the returned result has the requested size). the given size, it will be padded (so the returned result has the requested size).
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize. The image to resize.
size (:obj:`int` or :obj:`Tuple[int, int]`): size (`int` or `Tuple[int, int]`):
The size to which the image is cropped. The size to which the image is cropped.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
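A sketch chaining the mixin's helpers directly (in the library these methods are normally called by the concrete feature extractors, so instantiating the mixin by hand is purely illustrative):

```python
from PIL import Image
from transformers.image_utils import ImageFeatureExtractionMixin

helper = ImageFeatureExtractionMixin()
image = Image.new("RGB", (640, 480))              # placeholder image
resized = helper.resize(image, size=(256, 256))   # returns a PIL image
cropped = helper.center_crop(resized, size=224)   # 224x224 center crop
array = helper.to_numpy_array(cropped)            # rescaled, channel-first
print(array.shape)                                # (3, 224, 224)
```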
......
...@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR ...@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
@functools.wraps(trainable) @functools.wraps(trainable)
def dynamic_modules_import_trainable(*args, **kwargs): def dynamic_modules_import_trainable(*args, **kwargs):
""" """
Wrapper around ``tune.with_parameters`` to ensure datasets_modules are loaded on each Actor. Wrapper around `tune.with_parameters` to ensure datasets_modules are loaded on each Actor.
Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565. Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565.
Assumes that ``_objective``, defined above, is a function. Assumes that `_objective`, defined above, is a function.
""" """
if is_datasets_available(): if is_datasets_available():
import datasets.load import datasets.load
...@@ -372,11 +372,10 @@ def rewrite_logs(d): ...@@ -372,11 +372,10 @@ def rewrite_logs(d):
class TensorBoardCallback(TrainerCallback): class TensorBoardCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard).
<https://www.tensorflow.org/tensorboard>`__.
Args: Args:
tb_writer (:obj:`SummaryWriter`, `optional`): tb_writer (`SummaryWriter`, *optional*):
The writer to use. Will instantiate one if not set. The writer to use. Will instantiate one if not set.
""" """
...@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback): ...@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback):
class WandbCallback(TrainerCallback): class WandbCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Weights & Biases <https://www.wandb.com/>`__. A [`TrainerCallback`] that sends the logs to [Weights & Biases](https://www.wandb.com/).
""" """
def __init__(self): def __init__(self):
...@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback): ...@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback):
def setup(self, args, state, model, **kwargs): def setup(self, args, state, model, **kwargs):
""" """
Set up the optional Weights & Biases (`wandb`) integration. Set up the optional Weights & Biases (*wandb*) integration.
One can subclass and override this method to customize the setup if needed. Find more information `here One can subclass and override this method to customize the setup if needed. Find more information [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment variables:
<https://docs.wandb.ai/integrations/huggingface>`__. You can also override the following environment variables:
Environment: Environment:
WANDB_LOG_MODEL (:obj:`bool`, `optional`, defaults to :obj:`False`): WANDB_LOG_MODEL (`bool`, *optional*, defaults to `False`):
Whether or not to log model as artifact at the end of training. Use along with Whether or not to log model as artifact at the end of training. Use along with
`TrainingArguments.load_best_model_at_end` to upload the best model. *TrainingArguments.load_best_model_at_end* to upload the best model.
WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`): WANDB_WATCH (`str`, *optional* defaults to `"gradients"`):
Can be :obj:`"gradients"`, :obj:`"all"` or :obj:`"false"`. Set to :obj:`"false"` to disable gradient Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient
logging or :obj:`"all"` to log gradients and parameters. logging or `"all"` to log gradients and parameters.
WANDB_PROJECT (:obj:`str`, `optional`, defaults to :obj:`"huggingface"`): WANDB_PROJECT (`str`, *optional*, defaults to `"huggingface"`):
Set this to a custom string to store results in a different project. Set this to a custom string to store results in a different project.
WANDB_DISABLED (:obj:`bool`, `optional`, defaults to :obj:`False`): WANDB_DISABLED (`bool`, *optional*, defaults to `False`):
Whether or not to disable wandb entirely. Set `WANDB_DISABLED=true` to disable. Whether or not to disable wandb entirely. Set *WANDB_DISABLED=true* to disable.
""" """
if self._wandb is None: if self._wandb is None:
return return
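These are plain environment variables, so they can be set before training starts (the values below are examples):

```python
import os

os.environ["WANDB_PROJECT"] = "my-project"   # example project name
os.environ["WANDB_LOG_MODEL"] = "true"       # upload the model as an artifact
os.environ["WANDB_WATCH"] = "all"            # log gradients and parameters
```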
...@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback): ...@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback):
class CometCallback(TrainerCallback): class CometCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML <https://www.comet.ml/site/>`__. A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).
""" """
def __init__(self): def __init__(self):
...@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback): ...@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback):
Set up the optional Comet.ml integration. Set up the optional Comet.ml integration.
Environment: Environment:
COMET_MODE (:obj:`str`, `optional`): COMET_MODE (`str`, *optional*):
Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE", Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE",
or "DISABLED". Defaults to "ONLINE". or "DISABLED". Defaults to "ONLINE".
COMET_PROJECT_NAME (:obj:`str`, `optional`): COMET_PROJECT_NAME (`str`, *optional*):
Comet project name for experiments Comet project name for experiments
COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`): COMET_OFFLINE_DIRECTORY (`str`, *optional*):
Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE" Folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE"
COMET_LOG_ASSETS (:obj:`str`, `optional`): COMET_LOG_ASSETS (`str`, *optional*):
Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or
"FALSE". Defaults to "TRUE". "FALSE". Defaults to "TRUE".
For a number of configurable items in the environment, see `here For a number of configurable items in the environment, see [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
<https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
""" """
self._initialized = True self._initialized = True
log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper() log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
...@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback): ...@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback):
class AzureMLCallback(TrainerCallback): class AzureMLCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `AzureML A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).
<https://pypi.org/project/azureml-sdk/>`__.
""" """
def __init__(self, azureml_run=None): def __init__(self, azureml_run=None):
...@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback): ...@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback):
class MLflowCallback(TrainerCallback): class MLflowCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow <https://www.mlflow.org/>`__. A [`TrainerCallback`] that sends the logs to [MLflow](https://www.mlflow.org/).
""" """
def __init__(self): def __init__(self):
...@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback): ...@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback):
Set up the optional MLflow integration. Set up the optional MLflow integration.
Environment: Environment:
HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`): HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
Whether to use the MLflow .log_artifact() facility to log artifacts. Whether to use the MLflow .log_artifact() facility to log artifacts.
This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, it will copy This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to *True* or *1*, it will copy
whatever is in :class:`~transformers.TrainingArguments`'s ``output_dir`` to the local or remote whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote
artifact storage. Using it without a remote storage will just copy the files to your artifact location. artifact storage. Using it without a remote storage will just copy the files to your artifact location.
""" """
log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
...@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback): ...@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback):
class NeptuneCallback(TrainerCallback): class NeptuneCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Neptune <https://neptune.ai>`. A [`TrainerCallback`] that sends the logs to [Neptune](https://neptune.ai).
""" """
def __init__(self): def __init__(self):
...@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback): ...@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback):
Set up the Neptune integration. Set up the Neptune integration.
Environment: Environment:
NEPTUNE_PROJECT (:obj:`str`, `required`): NEPTUNE_PROJECT (`str`, *required*):
The project ID for the neptune.ai account. Should be in the format `workspace_name/project_name` The project ID for the neptune.ai account. Should be in the format *workspace_name/project_name*
NEPTUNE_API_TOKEN (:obj:`str`, `required`): NEPTUNE_API_TOKEN (`str`, *required*):
The API token for the neptune.ai account. The API token for the neptune.ai account.
NEPTUNE_CONNECTION_MODE (:obj:`str`, `optional`): NEPTUNE_CONNECTION_MODE (`str`, *optional*):
The Neptune connection mode; `async` by default. The Neptune connection mode; *async* by default.
NEPTUNE_RUN_NAME (:obj:`str`, `optional`): NEPTUNE_RUN_NAME (`str`, *optional*):
The name of the run process on the Neptune dashboard. The name of the run process on the Neptune dashboard.
""" """
if state.is_world_process_zero: if state.is_world_process_zero:
...@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback): ...@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback):
def __del__(self): def __del__(self):
""" """
Environment: Environment:
NEPTUNE_STOP_TIMEOUT (:obj:`int`, `optional`): NEPTUNE_STOP_TIMEOUT (`int`, *optional*):
Number of seconds to wait for all Neptune.ai tracking calls to finish before stopping the tracked Number of seconds to wait for all Neptune.ai tracking calls to finish before stopping the tracked
run. If not set, it will wait for all tracking calls to finish. run. If not set, it will wait for all tracking calls to finish.
""" """
...@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback): ...@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback):
class CodeCarbonCallback(TrainerCallback): class CodeCarbonCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that tracks the CO2 emission of training. A [`TrainerCallback`] that tracks the CO2 emission of training.
""" """
def __init__(self): def __init__(self):
......