"vscode:/vscode.git/clone" did not exist on "bc80f8bc37edb30413cfe106f0076570aa65284d"
Unverified commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
@@ -23,44 +23,43 @@ logger = logging.get_logger(__name__)

class MT5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
    instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the mT5
    [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. `d_kv` has to be equal to
            `d_model // num_heads`.
        d_ff (`int`, *optional*, defaults to 1024):
            Size of the intermediate feed forward layer in each `T5Block`.
        num_layers (`int`, *optional*, defaults to 8):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    """

    model_type = "mt5"
...
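Unlike the GPT and PEGASUS configurations further down, the converted MT5 docstring ships without a usage snippet. A minimal sketch of how this configuration is typically used, assuming the standard `transformers` API (the overridden values are purely illustrative):

```python
from transformers import MT5Config, MT5Model

# Build a configuration close to the documented defaults, overriding a few of
# the arguments described above (the values are illustrative only).
configuration = MT5Config(d_model=512, num_heads=6, feed_forward_proj="gated-gelu")

# Instantiate a randomly initialized mT5 model from that configuration.
model = MT5Model(configuration)

# The configuration is always recoverable from the model.
print(model.config.vocab_size)  # 250112 by default
```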
@@ -26,91 +26,92 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://huggingface.c

class OpenAIGPTConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`OpenAIGPTModel`] or a [`TFOpenAIGPTModel`]. It
    is used to instantiate a GPT model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    [GPT](https://huggingface.co/openai-gpt) architecture from OpenAI.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 40478):
            Vocabulary size of the GPT model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`OpenAIGPTModel`] or [`TFOpenAIGPTModel`].
        n_positions (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        afn (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`int`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        predict_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not special tokens should be predicted when the model has a language modeling head.
        summary_type (`str`, *optional*, defaults to `"cls_index"`):
            Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
            [`TFOpenAIGPTDoubleHeadsModel`].

            Has to be one of the following options:

            - `"last"`: Take the last token hidden state (like XLNet).
            - `"first"`: Take the first token hidden state (like BERT).
            - `"mean"`: Take the mean of all tokens hidden states.
            - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
            - `"attn"`: Not implemented now, use multi-head attention.
        summary_use_proj (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
            [`TFOpenAIGPTDoubleHeadsModel`].

            Whether or not to add a projection after the vector extraction.
        summary_activation (`str`, *optional*):
            Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
            [`TFOpenAIGPTDoubleHeadsModel`].

            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
            [`TFOpenAIGPTDoubleHeadsModel`].

            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
        summary_first_dropout (`float`, *optional*, defaults to 0.1):
            Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
            [`TFOpenAIGPTDoubleHeadsModel`].

            The dropout ratio to be used after the projection and activation.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).

    Examples:

    ```python
    >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel

    >>> # Initializing a GPT configuration
    >>> configuration = OpenAIGPTConfig()

    >>> # Initializing a model from the configuration
    >>> model = OpenAIGPTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "openai-gpt"
    attribute_map = {
...
@@ -75,18 +75,18 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
      `BasicTokenizer` if not.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
    """
...
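For context, a short sketch of how this tokenizer is typically used, assuming the `openai-gpt` checkpoint is available on the Hub:

```python
from transformers import OpenAIGPTTokenizer

# Load the slow (pure-Python) GPT tokenizer; note that it lowercases all inputs.
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

tokens = tokenizer.tokenize("Hello, my dog is cute")
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Or do both steps at once and get input_ids plus an attention mask.
encoding = tokenizer("Hello, my dog is cute")
print(tokens)
print(encoding["input_ids"])
```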
@@ -39,21 +39,21 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
    """
...
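The fast variant exposes the same interface; one thing the Rust backend adds is character offsets, sketched below (again assuming the `openai-gpt` checkpoint):

```python
from transformers import OpenAIGPTTokenizerFast

tokenizer = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")

# Fast tokenizers can report the character span each token was taken from.
encoding = tokenizer("Hello, my dog is cute", return_offsets_mapping=True)
print(encoding["input_ids"])
print(encoding["offset_mapping"])
```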
@@ -28,77 +28,77 @@ PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class PegasusConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to instantiate a
    PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the PEGASUS
    [google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`PegasusModel`] or [`TFPegasusModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        forced_eos_token_id (`int`, *optional*, defaults to 1):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import PegasusModel, PegasusConfig

    >>> # Initializing a PEGASUS google/pegasus-large style configuration
    >>> configuration = PegasusConfig()

    >>> # Initializing a model from the google/pegasus-large style configuration
    >>> model = PegasusModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "pegasus"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
...
@@ -989,17 +989,18 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration

        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')

        >>> text = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
        >>> encoder_outputs = model.encode(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
...@@ -1054,23 +1055,24 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel): ...@@ -1054,23 +1055,24 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
r""" r"""
Returns: Returns:
Example:: Example:
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration ```python
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large') >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
>>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
>>> text = "My friends are cool but they eat too many carbs." >>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np') >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs) >>> encoder_outputs = model.encode(**inputs)
>>> decoder_start_token_id = model.config.decoder_start_token_id >>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
>>> outputs = model.decode(decoder_input_ids, encoder_outputs) >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> last_decoder_hidden_states = outputs.last_hidden_state >>> last_decoder_hidden_states = outputs.last_hidden_state
""" ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
...@@ -1322,23 +1324,24 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel): ...@@ -1322,23 +1324,24 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
r""" r"""
Returns: Returns:
Example:: Example:
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration ```python
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large') >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
>>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
>>> text = "My friends are cool but they eat too many carbs." >>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np') >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs) >>> encoder_outputs = model.encode(**inputs)
>>> decoder_start_token_id = model.config.decoder_start_token_id >>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
>>> outputs = model.decode(decoder_input_ids, encoder_outputs) >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> logits = outputs.logits >>> logits = outputs.logits
""" ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......
@@ -1197,19 +1197,20 @@ class PegasusModel(PegasusPreTrainedModel):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import PegasusTokenizer, PegasusModel

        >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
        >>> model = PegasusModel.from_pretrained("google/pegasus-large")

        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
...
@@ -40,56 +40,57 @@ logger = logging.get_logger(__name__)

class PegasusTokenizer(PreTrainedTokenizer):
    r"""
    Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
            The token used for masking single token values. This is the token used when training this model with
            masked language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during
            pretraining. It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
            The token used for masking whole target sentences. This is the token used when training this model with
            gap sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided, <mask_2>
            and <unk_2, ..., unk_102> are used as additional special tokens corresponding to the
            [original PEGASUS tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
            that uses the tokens 2 - 104 only for pretraining.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The
            [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be
            used, among other things, to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """

    vocab_files_names = VOCAB_FILES_NAMES
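To make the `sp_model_kwargs` argument above concrete, a hedged sketch of enabling subword regularization; the sampling values are illustrative and the `google/pegasus-large` checkpoint is assumed:

```python
from transformers import PegasusTokenizer

# SentencePiece options are forwarded through sp_model_kwargs, as described above.
tokenizer = PegasusTokenizer.from_pretrained(
    "google/pegasus-large",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated calls may segment the same text differently.
print(tokenizer.tokenize("Subword regularization makes tokenization stochastic."))
print(tokenizer.tokenize("Subword regularization makes tokenization stochastic."))
```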
@@ -252,22 +253,22 @@ class PegasusTokenizer(PreTrainedTokenizer):
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence:

        - single sequence: `X </s>`
        - pair of sequences: `A B </s>` (not intended use)

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
...
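A small sketch of the `X </s>` format described above, assuming the `google/pegasus-large` checkpoint:

```python
from transformers import PegasusTokenizer

tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")

ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Studies show that"))

# Single sequence: X </s> -- only the EOS id is appended, no BOS.
with_special = tokenizer.build_inputs_with_special_tokens(ids)
assert with_special == ids + [tokenizer.eos_token_id]
print(with_special)
```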
@@ -51,43 +51,44 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

class PegasusTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
            The token used for masking single token values. This is the token used when training this model with
            masked language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during
            pretraining. It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
            The token used for masking whole target sentences. This is the token used when training this model with
            gap sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided, <mask_2>
            and <unk_2, ..., unk_102> are used as additional special tokens corresponding to the
            [original PEGASUS tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
            that uses the tokens 2 - 104 only for pretraining.
    """

    vocab_files_names = VOCAB_FILES_NAMES
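To see the mask-token arguments above in practice, a brief sketch inspecting a pretrained checkpoint (`google/pegasus-large` assumed; the exact contents of `additional_special_tokens` depend on the checkpoint):

```python
from transformers import PegasusTokenizerFast

tokenizer = PegasusTokenizerFast.from_pretrained("google/pegasus-large")

# mask_token is the MLM mask "<mask_2>"; the GSG sentence mask "<mask_1>" is
# typically kept among the additional special tokens, as described above.
print(tokenizer.mask_token)
print(tokenizer.additional_special_tokens[:3])
print(tokenizer("Thank <mask_2> for inviting me.").input_ids)
```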
@@ -175,17 +176,17 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
        """
        Build model inputs from a sequence by adding eos to the end. No BOS token is added to the front.

        - single sequence: `X </s>`
        - pair of sequences: `A B </s>` (not intended use)

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
...
@@ -28,85 +28,86 @@ PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class PerceiverConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PerceiverModel`]. It is used to instantiate a
    Perceiver model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Perceiver
    [deepmind/language-perceiver](https://huggingface.co/deepmind/language-perceiver) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_latents (`int`, *optional*, defaults to 256):
            The number of latents.
        d_latents (`int`, *optional*, defaults to 1280):
            Dimension of the latent embeddings.
        d_model (`int`, *optional*, defaults to 768):
            Dimension of the inputs. Should only be provided in case [*PerceiverTextPreprocessor*] is used or no
            preprocessor is provided.
        num_blocks (`int`, *optional*, defaults to 1):
            Number of blocks in the Transformer encoder.
        num_self_attends_per_block (`int`, *optional*, defaults to 26):
            The number of self-attention layers per block.
        num_self_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each self-attention layer in the Transformer encoder.
        num_cross_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each cross-attention layer in the Transformer encoder.
        qk_channels (`int`, *optional*):
            Dimension to project the queries + keys before applying attention in the cross-attention and
            self-attention layers of the encoder. Will default to preserving the dimension of the queries if not
            specified.
        v_channels (`int`, *optional*):
            Dimension to project the values before applying attention in the cross-attention and self-attention
            layers of the encoder. Will default to preserving the dimension of the queries if not specified.
        cross_attention_shape_for_attention (`str`, *optional*, defaults to `'kv'`):
            Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
        self_attention_widening_factor (`int`, *optional*, defaults to 1):
            Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
        cross_attention_widening_factor (`int`, *optional*, defaults to 1):
            Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        use_query_residual (`float`, *optional*, defaults to `True`):
            Whether to add a query residual in the cross-attention layer of the encoder.
        vocab_size (`int`, *optional*, defaults to 262):
            Vocabulary size for the masked language modeling model.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that the masked language modeling model might ever be used with. Typically
            set this to something large just in case (e.g., 512 or 1024 or 2048).
        image_size (`int`, *optional*, defaults to 56):
            Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
        train_size (`List[int]`, *optional*, defaults to [368, 496]):
            Training size of the images for the optical flow model.
        num_frames (`int`, *optional*, defaults to 16):
            Number of video frames used for the multimodal autoencoding model.
        audio_samples_per_frame (`int`, *optional*, defaults to 1920):
            Number of audio samples per frame for the multimodal autoencoding model.
        samples_per_patch (`int`, *optional*, defaults to 16):
            Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
        output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
            Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the
            multimodal autoencoding model. This excludes the channel dimension.

    Example:

    ```python
    >>> from transformers import PerceiverModel, PerceiverConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "perceiver"

    def __init__(
...
...@@ -38,31 +38,31 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi ...@@ -38,31 +38,31 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
r""" r"""
Constructs a Perceiver feature extractor. Constructs a Perceiver feature extractor.
This feature extractor inherits from :class:`~transformers.ImageFeatureExtractionMixin` which contains most of the This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the
main methods. Users should refer to this superclass for more information regarding those methods. main methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`): do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge, Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped. the image is padded with 0's and then center cropped.
crop_size (:obj:`int`, `optional`, defaults to 256): crop_size (`int`, *optional*, defaults to 256):
Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
:obj:`True`. `True`.
do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain :obj:`size`. Whether to resize the input to a certain `size`.
size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224): size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize` integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
is set to :obj:`True`. is set to `True`.
resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`): resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`, An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
:obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`. `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
Only has an effect if :obj:`do_resize` is set to :obj:`True`. Only has an effect if `do_resize` is set to `True`.
do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`. Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (:obj:`List[float]`, defaults to :obj:`[0.485, 0.456, 0.406]`): image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images. The sequence of means for each channel, to be used when normalizing images.
image_std (:obj:`List[float]`, defaults to :obj:`[0.229, 0.224, 0.225]`): image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images. The sequence of standard deviations for each channel, to be used when normalizing images.
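Example (a minimal sketch, assuming a dummy PIL image and the default preprocessing values listed above):
```python
>>> from PIL import Image
>>> from transformers import PerceiverFeatureExtractor

>>> # defaults: center crop to 256, resize to 224, normalize with the mean/std listed above
>>> feature_extractor = PerceiverFeatureExtractor()

>>> # any RGB image works; a dummy image avoids a download
>>> image = Image.new("RGB", (640, 480))
>>> encoding = feature_extractor(images=image, return_tensors="pt")
>>> encoding["pixel_values"].shape  # expected: (1, 3, 224, 224) with the defaults
```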
""" """
...@@ -92,11 +92,11 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi ...@@ -92,11 +92,11 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
def center_crop(self, image): def center_crop(self, image):
""" """
Crops :obj:`image` to `self.crop_size` using a center crop. Note that if the image is too small to be cropped Crops `image` to *self.crop_size* using a center crop. Note that if the image is too small to be cropped
to the size given, it will be padded (so the returned result has the size asked). to the size given, it will be padded (so the returned result has the size asked).
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to center crop. The image to center crop.
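A minimal sketch, assuming an illustrative 300x500 dummy image and `crop_size=256`:
```python
>>> from PIL import Image
>>> from transformers import PerceiverFeatureExtractor

>>> feature_extractor = PerceiverFeatureExtractor(crop_size=256)
>>> image = Image.new("RGB", (300, 500))  # narrower than crop_size, so it gets padded first
>>> cropped = feature_extractor.center_crop(image)
>>> cropped.size  # expected: (256, 256)
```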
""" """
...@@ -125,27 +125,29 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi ...@@ -125,27 +125,29 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
""" """
Main method to prepare for the model one or several image(s). Main method to prepare for the model one or several image(s).
.. warning:: <Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
PIL images. PIL images.
</Tip>
Args: Args:
images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width. number of channels, H and W are image height and width.
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`): return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are: If set, will return tensors of a particular framework. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. - `'tf'`: Return TensorFlow `tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects.
* :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects. - `'np'`: Return NumPy `np.ndarray` objects.
* :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects. - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns: Returns:
:class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width). width).
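A minimal sketch of a batched call, assuming two dummy PIL images:
```python
>>> from PIL import Image
>>> from transformers import PerceiverFeatureExtractor

>>> feature_extractor = PerceiverFeatureExtractor()
>>> images = [Image.new("RGB", (400, 300)), Image.new("RGB", (640, 480))]
>>> encoding = feature_extractor(images=images, return_tensors="np")
>>> encoding["pixel_values"].shape  # expected: (2, 3, 224, 224) with the default crop/resize settings
```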
......
...@@ -765,83 +765,84 @@ class PerceiverModel(PerceiverPreTrainedModel): ...@@ -765,83 +765,84 @@ class PerceiverModel(PerceiverPreTrainedModel):
r""" r"""
Returns: Returns:
Examples:: Examples:
>>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel ```python
>>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
>>> import torch >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
>>> import requests >>> import torch
>>> from PIL import Image >>> import requests
>>> from PIL import Image
>>> # EXAMPLE 1: using the Perceiver to classify texts
>>> # - we define a TextPreprocessor, which can be used to embed tokens >>> # EXAMPLE 1: using the Perceiver to classify texts
>>> # - we define a ClassificationDecoder, which can be used to decode the >>> # - we define a TextPreprocessor, which can be used to embed tokens
>>> # final hidden states of the latents to classification logits >>> # - we define a ClassificationDecoder, which can be used to decode the
>>> # using trainable position embeddings >>> # final hidden states of the latents to classification logits
>>> config = PerceiverConfig() >>> # using trainable position embeddings
>>> preprocessor = PerceiverTextPreprocessor(config) >>> config = PerceiverConfig()
>>> decoder = PerceiverClassificationDecoder(config, >>> preprocessor = PerceiverTextPreprocessor(config)
... num_channels=config.d_latents, >>> decoder = PerceiverClassificationDecoder(config,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1), ... num_channels=config.d_latents,
... use_query_residual=True) ... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
>>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder) ... use_query_residual=True)
>>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
>>> # you can then do a forward pass as follows:
>>> tokenizer = PerceiverTokenizer() >>> # you can then do a forward pass as follows:
>>> text = "hello world" >>> tokenizer = PerceiverTokenizer()
>>> inputs = tokenizer(text, return_tensors="pt").input_ids >>> text = "hello world"
>>> inputs = tokenizer(text, return_tensors="pt").input_ids
>>> with torch.no_grad():
>>> outputs = model(inputs=inputs) >>> with torch.no_grad():
>>> logits = outputs.logits >>> outputs = model(inputs=inputs)
>>> logits = outputs.logits
>>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss() >>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()
>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels) >>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels)
>>> # EXAMPLE 2: using the Perceiver to classify images
>>> # - we define an ImagePreprocessor, which can be used to embed images >>> # EXAMPLE 2: using the Perceiver to classify images
>>> preprocessor=PerceiverImagePreprocessor( >>> # - we define an ImagePreprocessor, which can be used to embed images
... config, >>> preprocessor=PerceiverImagePreprocessor(
... prep_type="conv1x1", ... config,
... spatial_downsample=1, ... prep_type="conv1x1",
... out_channels=256, ... spatial_downsample=1,
... position_encoding_type="trainable", ... out_channels=256,
... concat_or_add_pos="concat", ... position_encoding_type="trainable",
... project_pos_dim=256, ... concat_or_add_pos="concat",
... trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2, ... project_pos_dim=256,
... ), ... trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2,
... ) ... ),
... )
>>> model = PerceiverModel(
... config, >>> model = PerceiverModel(
... input_preprocessor=preprocessor, ... config,
... decoder=PerceiverClassificationDecoder( ... input_preprocessor=preprocessor,
... config, ... decoder=PerceiverClassificationDecoder(
... num_channels=config.d_latents, ... config,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1), ... num_channels=config.d_latents,
... use_query_residual=True, ... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
... ), ... use_query_residual=True,
... ) ... ),
... )
>>> # you can then do a forward pass as follows:
>>> feature_extractor = PerceiverFeatureExtractor() >>> # you can then do a forward pass as follows:
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> feature_extractor = PerceiverFeatureExtractor()
>>> image = Image.open(requests.get(url, stream=True).raw) >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> inputs = feature_extractor(image, return_tensors="pt").pixel_values >>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
>>> with torch.no_grad():
>>> outputs = model(inputs=inputs) >>> with torch.no_grad():
>>> logits = outputs.logits >>> outputs = model(inputs=inputs)
>>> logits = outputs.logits
>>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss() >>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()
>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels) >>> labels = torch.tensor([1])
""" >>> loss = criterion(logits, labels)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......
...@@ -28,26 +28,29 @@ class PerceiverTokenizer(PreTrainedTokenizer): ...@@ -28,26 +28,29 @@ class PerceiverTokenizer(PreTrainedTokenizer):
""" """
Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes (UTF-8 encoding). Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes (UTF-8 encoding).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[BOS]"`): bos_token (`str`, *optional*, defaults to `"[BOS]"`):
The BOS token (reserved in the vocab, but not actually used). The BOS token (reserved in the vocab, but not actually used).
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[EOS]"`): eos_token (`str`, *optional*, defaults to `"[EOS]"`):
The end of sequence token (reserved in the vocab, but not actually used). The end of sequence token (reserved in the vocab, but not actually used).
.. note:: <Tip>
When building a sequence using special tokens, this is not the token that is used for the end of When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`. sequence. The token used is the `sep_token`.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
</Tip>
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The MASK token, useful for masked language modeling. The MASK token, useful for masked language modeling.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The CLS token (reserved in the vocab, but not actually used). The CLS token (reserved in the vocab, but not actually used).
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from two sequences. The separator token, which is used when building a sequence from two sequences.
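Example (a minimal sketch; since the tokenizer works on raw UTF-8 bytes, it can be instantiated without a pretrained vocabulary):
```python
>>> from transformers import PerceiverTokenizer

>>> tokenizer = PerceiverTokenizer()
>>> encoding = tokenizer("hello world", return_tensors="pt")
>>> encoding.input_ids.shape  # one id per UTF-8 byte, plus the special tokens added around the text
>>> tokenizer.decode(encoding.input_ids[0])
```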
""" """
...@@ -115,18 +118,18 @@ class PerceiverTokenizer(PreTrainedTokenizer): ...@@ -115,18 +118,18 @@ class PerceiverTokenizer(PreTrainedTokenizer):
) -> List[int]: ) -> List[int]:
""" """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method. special tokens using the tokenizer `prepare_for_model` method.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs. List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model. Whether or not the token list is already formatted with special tokens for the model.
Returns: Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
return super().get_special_tokens_mask( return super().get_special_tokens_mask(
...@@ -145,17 +148,17 @@ class PerceiverTokenizer(PreTrainedTokenizer): ...@@ -145,17 +148,17 @@ class PerceiverTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the
following format: following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: `[CLS] X [SEP]`
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added. List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
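A minimal sketch, assuming ids produced with `add_special_tokens=False`:
```python
>>> from transformers import PerceiverTokenizer

>>> tokenizer = PerceiverTokenizer()
>>> ids = tokenizer("hello", add_special_tokens=False).input_ids
>>> tokenizer.build_inputs_with_special_tokens(ids)  # ids wrapped as [CLS] X [SEP], as described above
```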
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......
...@@ -69,41 +69,47 @@ class PhobertTokenizer(PreTrainedTokenizer): ...@@ -69,41 +69,47 @@ class PhobertTokenizer(PreTrainedTokenizer):
""" """
Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding. Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
vocab_file (:obj:`str`): vocab_file (`str`):
Path to the vocabulary file. Path to the vocabulary file.
merges_file (:obj:`str`): merges_file (`str`):
Path to the merges file. Path to the merges file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note:: <Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`. sequence. The token used is the `cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: <Tip>
When building a sequence using special tokens, this is not the token that is used for the end of When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`. sequence. The token used is the `sep_token`.
</Tip>
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens. instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`): mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
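Example (a minimal sketch, assuming the publicly available "vinai/phobert-base" checkpoint and word-segmented input):
```python
>>> from transformers import PhobertTokenizer

>>> tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base")
>>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."  # PhoBERT expects word-segmented input
>>> input_ids = tokenizer(line, return_tensors="pt").input_ids
```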
""" """
...@@ -162,17 +168,17 @@ class PhobertTokenizer(PreTrainedTokenizer): ...@@ -162,17 +168,17 @@ class PhobertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A PhoBERT sequence has the following format: adding special tokens. A PhoBERT sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: `<s> X </s>`
- pair of sequences: ``<s> A </s></s> B </s>`` - pair of sequences: `<s> A </s></s> B </s>`
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added. List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
...@@ -186,18 +192,18 @@ class PhobertTokenizer(PreTrainedTokenizer): ...@@ -186,18 +192,18 @@ class PhobertTokenizer(PreTrainedTokenizer):
) -> List[int]: ) -> List[int]:
""" """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method. special tokens using the tokenizer `prepare_for_model` method.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs. List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model. Whether or not the token list is already formatted with special tokens for the model.
Returns: Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -217,13 +223,13 @@ class PhobertTokenizer(PreTrainedTokenizer): ...@@ -217,13 +223,13 @@ class PhobertTokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned. make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs. List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of zeros. `List[int]`: List of zeros.
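A minimal sketch (assuming the "vinai/phobert-base" checkpoint) showing that the returned mask is all zeros:
```python
>>> from transformers import PhobertTokenizer

>>> tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base")
>>> ids_a = tokenizer("xin chào", add_special_tokens=False).input_ids
>>> ids_b = tokenizer("thế_giới", add_special_tokens=False).input_ids
>>> tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)  # all zeros, since PhoBERT ignores token types
```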
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
......
...@@ -28,69 +28,69 @@ PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -28,69 +28,69 @@ PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ProphetNetConfig(PretrainedConfig): class ProphetNetConfig(PretrainedConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.ProphetNetModel`. It is used This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used
to instantiate a ProphetNet model according to the specified arguments, defining the model architecture. to instantiate a ProphetNet model according to the specified arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args: Args:
activation_dropout (:obj:`float`, `optional`, defaults to 0.1): activation_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for activations inside the fully connected layer. The dropout ratio for activations inside the fully connected layer.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
vocab_size (:obj:`int`, `optional`, defaults to 30522): vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ProphetNet model. Defines the number of different tokens that can be represented by Vocabulary size of the ProphetNet model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`. the `inputs_ids` passed when calling [`ProphetNetModel`].
hidden_size (:obj:`int`, `optional`, defaults to 1024): hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer. Dimensionality of the layers and the pooler layer.
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
num_encoder_layers (:obj:`int`, `optional`, defaults to 12): num_encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers. Number of encoder layers.
num_encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): num_encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the ``intermediate`` (often named feed-forward) layer in decoder. Dimensionality of the `intermediate` (often named feed-forward) layer in decoder.
num_decoder_layers (:obj:`int`, `optional`, defaults to 12): num_decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers. Number of decoder layers.
num_decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): num_decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder. Number of attention heads for each attention layer in the Transformer decoder.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1): attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
dropout (:obj:`float`, `optional`, defaults to 0.1): dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
init_std (:obj:`float`, `optional`, defaults to 0.02): init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`True`): add_cross_attention (`bool`, *optional*, defaults to `True`):
Whether cross-attention layers should be added to the model. Whether cross-attention layers should be added to the model.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether this is an encoder/decoder model. Whether this is an encoder/decoder model.
pad_token_id (:obj:`int`, `optional`, defaults to 1): pad_token_id (`int`, *optional*, defaults to 1):
Padding token id. Padding token id.
bos_token_id (:obj:`int`, `optional`, defaults to 0): bos_token_id (`int`, *optional*, defaults to 0):
Beginning of stream token id. Beginning of stream token id.
eos_token_id (:obj:`int`, `optional`, defaults to 2): eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id. End of stream token id.
ngram (:obj:`int`, `optional`, defaults to 2): ngram (`int`, *optional*, defaults to 2):
Number of future tokens to predict. Set to 1 to be the same as a traditional language model, predicting only the Number of future tokens to predict. Set to 1 to be the same as a traditional language model, predicting only the
next token. next token.
num_buckets (:obj:`int`, `optional`, defaults to 32): num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer. This is for relative position calculation. See the The number of buckets to use for each attention layer. This is for relative position calculation. See the
`T5 paper <https://arxiv.org/abs/1910.10683>`__ for more details. [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
relative_max_distance (:obj:`int`, `optional`, defaults to 128): relative_max_distance (`int`, *optional*, defaults to 128):
Relative distances greater than this number will be put into the same last bucket. This is for relative Relative distances greater than this number will be put into the same last bucket. This is for relative
position calculation. See the `T5 paper <https://arxiv.org/abs/1910.10683>`__ for more details. position calculation. See the [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
disable_ngram_loss (:obj:`bool`, `optional`, defaults to :obj:`False`): disable_ngram_loss (`bool`, *optional*, defaults to `False`):
Whether to train the model to predict only the next first token. Whether to train the model to predict only the next first token.
eps (:obj:`float`, `optional`, defaults to 0.0): eps (`float`, *optional*, defaults to 0.0):
Controls the ``epsilon`` parameter value for label smoothing in the loss calculation. If set to 0, no label Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
smoothing is performed. smoothing is performed.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Whether or not the model should return the last key/values attentions (not used by all models).
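Example (a minimal sketch, mirroring the other configuration examples in this conversion):
```python
>>> from transformers import ProphetNetConfig, ProphetNetModel

>>> # Initializing a ProphetNet configuration with the default values documented above
>>> configuration = ProphetNetConfig()

>>> # Initializing a (randomly initialized) model from that configuration
>>> model = ProphetNetModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```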
""" """
model_type = "prophetnet" model_type = "prophetnet"
......
...@@ -1271,18 +1271,19 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel): ...@@ -1271,18 +1271,19 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r""" r"""
Returns: Returns:
Example:: Example:
>>> from transformers import ProphetNetTokenizer, ProphetNetEncoder ```python
>>> import torch >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
>>> import torch
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased') >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone') >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state >>> last_hidden_states = outputs.last_hidden_state
""" ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -1788,20 +1789,21 @@ class ProphetNetModel(ProphetNetPreTrainedModel): ...@@ -1788,20 +1789,21 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
r""" r"""
Returns: Returns:
Example:: Example:
>>> from transformers import ProphetNetTokenizer, ProphetNetModel ```python
>>> from transformers import ProphetNetTokenizer, ProphetNetModel
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased') >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased') >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state # main stream hidden states >>> last_hidden_states = outputs.last_hidden_state # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states
""" ```"""
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
......
...@@ -56,46 +56,45 @@ class ProphetNetTokenizer(PreTrainedTokenizer): ...@@ -56,46 +56,45 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
r""" r"""
Construct a ProphetNetTokenizer. Based on WordPiece. Construct a ProphetNetTokenizer. Based on WordPiece.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
vocab_file (:obj:`str`): vocab_file (`str`):
File containing the vocabulary. File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing. Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece. Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`): never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True` `do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens. token of a sequence built with special tokens.
x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`): x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`):
Special second separator token, which can be generated by Special second separator token, which can be generated by
:class:`~transformers.ProphetNetForConditionalGeneration`. It is used to separate bullet-point like [`ProphetNetForConditionalGeneration`]. It is used to separate bullet-point like
sentences in summarization. sentences in summarization.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens. instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
<https://github.com/huggingface/transformers/issues/328>`__). strip_accents (`bool`, *optional*):
strip_accents (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
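Example (a minimal sketch, reusing the "microsoft/prophetnet-large-uncased" checkpoint from the model examples above):
```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> inputs = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="pt")
>>> inputs.input_ids.shape
```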
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,18 +188,18 @@ class ProphetNetTokenizer(PreTrainedTokenizer): ...@@ -189,18 +188,18 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
) -> List[int]: ) -> List[int]:
""" """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method. special tokens using the tokenizer `prepare_for_model` method.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs. List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model. Whether or not the token list is already formatted with special tokens for the model.
Returns: Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
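A minimal sketch (same checkpoint as above) of retrieving the mask for ids that already contain special tokens:
```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> ids = tokenizer("hello world", add_special_tokens=False).input_ids
>>> ids_with_special = tokenizer.build_inputs_with_special_tokens(ids)
>>> tokenizer.get_special_tokens_mask(ids_with_special, already_has_special_tokens=True)
>>> # 0 for every regular token, 1 for the trailing [SEP]
```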
""" """
if already_has_special_tokens: if already_has_special_tokens:
return super().get_special_tokens_mask( return super().get_special_tokens_mask(
...@@ -218,21 +217,21 @@ class ProphetNetTokenizer(PreTrainedTokenizer): ...@@ -218,21 +217,21 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
sequence pair mask has the following format: sequence pair mask has the following format:
:: ```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs. List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s). sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
...@@ -267,17 +266,17 @@ class ProphetNetTokenizer(PreTrainedTokenizer): ...@@ -267,17 +266,17 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A ProphetNet sequence has the following format: adding special tokens. A ProphetNet sequence has the following format:
- single sequence: ``X [SEP]`` - single sequence: `X [SEP]`
- pair of sequences: ``A [SEP] B [SEP]`` - pair of sequences: `A [SEP] B [SEP]`
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added. List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
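A minimal sketch (same checkpoint as above) of building a sequence pair:
```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> ids_a = tokenizer("hello", add_special_tokens=False).input_ids
>>> ids_b = tokenizer("world", add_special_tokens=False).input_ids
>>> tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)  # A [SEP] B [SEP], as described above
```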
""" """
if token_ids_1 is None: if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id] return token_ids_0 + [self.sep_token_id]
......
...@@ -28,60 +28,60 @@ QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -28,60 +28,60 @@ QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class QDQBertConfig(PretrainedConfig): class QDQBertConfig(PretrainedConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.QDQBertModel`. It is used to This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to
instantiate a QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a instantiate a QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased configuration with the defaults will yield a similar configuration to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
<https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522): vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.QDQBertModel`. `inputs_ids` passed when calling [`QDQBertModel`].
hidden_size (:obj:`int`, `optional`, defaults to 768): hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer. Dimension of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12): num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12): num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072): intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2): type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.QDQBertModel`. The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`].
initializer_range (:obj:`float`, `optional`, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if ``config.is_decoder=True``. relevant if `config.is_decoder=True`.
Examples:: Examples:
>>> from transformers import QDQBertModel, QDQBertConfig ```python
>>> from transformers import QDQBertModel, QDQBertConfig
>>> # Initializing a QDQBERT bert-base-uncased style configuration >>> # Initializing a QDQBERT bert-base-uncased style configuration
>>> configuration = QDQBertConfig() >>> configuration = QDQBertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration >>> # Initializing a model from the bert-base-uncased style configuration
>>> model = QDQBertModel(configuration) >>> model = QDQBertModel(configuration)
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
""" ```"""
model_type = "qdqbert" model_type = "qdqbert"
def __init__( def __init__(
......
...@@ -21,62 +21,62 @@ from ...file_utils import add_start_docstrings ...@@ -21,62 +21,62 @@ from ...file_utils import add_start_docstrings
RAG_CONFIG_DOC = r""" RAG_CONFIG_DOC = r"""
:class:`~transformers.RagConfig` stores the configuration of a `RagModel`. Configuration objects inherit from [`RagConfig`] stores the configuration of a *RagModel*. Configuration objects inherit from
:class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
:class:`~transformers.PretrainedConfig` for more information. [`PretrainedConfig`] for more information.
Args: Args:
title_sep (:obj:`str`, `optional`, defaults to ``" / "``): title_sep (`str`, *optional*, defaults to `" / "`):
Separator inserted between the title and the text of the retrieved document when calling Separator inserted between the title and the text of the retrieved document when calling
:class:`~transformers.RagRetriever`. [`RagRetriever`].
doc_sep (:obj:`str`, `optional`, defaults to ``" // "``): doc_sep (`str`, *optional*, defaults to `" // "`):
Separator inserted between the text of the retrieved document and the original input when calling Separator inserted between the text of the retrieved document and the original input when calling
:class:`~transformers.RagRetriever`. [`RagRetriever`].
n_docs (:obj:`int`, `optional`, defaults to 5): n_docs (`int`, *optional*, defaults to 5):
Number of documents to retrieve. Number of documents to retrieve.
max_combined_length (:obj:`int`, `optional`, defaults to 300): max_combined_length (`int`, *optional*, defaults to 300):
Max length of contextualized input returned by :meth:`~transformers.RagRetriever.__call__`. Max length of contextualized input returned by [`~RagRetriever.__call__`].
retrieval_vector_size (:obj:`int`, `optional`, defaults to 768): retrieval_vector_size (`int`, *optional*, defaults to 768):
Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`. Dimensionality of the document embeddings indexed by [`RagRetriever`].
retrieval_batch_size (:obj:`int`, `optional`, defaults to 8): retrieval_batch_size (`int`, *optional*, defaults to 8):
Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated by Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated by
:class:`~transformers.RagRetriever`. [`RagRetriever`].
dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`): dataset (`str`, *optional*, defaults to `"wiki_dpr"`):
A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
using :obj:`datasets.list_datasets()`). using `datasets.list_datasets()`).
dataset_split (:obj:`str`, `optional`, defaults to :obj:`"train"`): dataset_split (`str`, *optional*, defaults to `"train"`):
Which split of the :obj:`dataset` to load. Which split of the `dataset` to load.
index_name (:obj:`str`, `optional`, defaults to :obj:`"compressed"`): index_name (`str`, *optional*, defaults to `"compressed"`):
The index name of the index associated with the :obj:`dataset`. One can choose between :obj:`"legacy"`, The index name of the index associated with the `dataset`. One can choose between `"legacy"`,
:obj:`"exact"` and :obj:`"compressed"`. `"exact"` and `"compressed"`.
index_path (:obj:`str`, `optional`): index_path (`str`, *optional*):
The path to the serialized faiss index on disk. The path to the serialized faiss index on disk.
passages_path (:obj:`str`, `optional`): passages_path (`str`, *optional*):
A path to text passages compatible with the faiss index. Required if using A path to text passages compatible with the faiss index. Required if using
:class:`~transformers.models.rag.retrieval_rag.LegacyIndex`. [`~models.rag.retrieval_rag.LegacyIndex`].
use_dummy_dataset (:obj:`bool`, `optional`, defaults to ``False``): use_dummy_dataset (`bool`, *optional*, defaults to `False`):
Whether to load a "dummy" variant of the dataset specified by :obj:`dataset`. Whether to load a "dummy" variant of the dataset specified by `dataset`.
label_smoothing (:obj:`float`, `optional`, defaults to 0.0): label_smoothing (`float`, *optional*, defaults to 0.0):
Only relevant if ``return_loss`` is set to :obj:`True`. Controls the ``epsilon`` parameter value for label Only relevant if `return_loss` is set to `True`. Controls the `epsilon` parameter value for label
smoothing in the loss calculation. If set to 0, no label smoothing is performed. smoothing in the loss calculation. If set to 0, no label smoothing is performed.
do_marginalize (:obj:`bool`, `optional`, defaults to :obj:`False`): do_marginalize (`bool`, *optional*, defaults to `False`):
If :obj:`True`, the logits are marginalized over all documents by making use of If `True`, the logits are marginalized over all documents by making use of
``torch.nn.functional.log_softmax``. `torch.nn.functional.log_softmax`.
reduce_loss (:obj:`bool`, `optional`, defaults to :obj:`False`): reduce_loss (`bool`, *optional*, defaults to `False`):
Whether or not to reduce the NLL loss using the ``torch.Tensor.sum`` operation. Whether or not to reduce the NLL loss using the `torch.Tensor.sum` operation.
do_deduplication (:obj:`bool`, `optional`, defaults to :obj:`True`): do_deduplication (`bool`, *optional*, defaults to `True`):
Whether or not to deduplicate the generations from different context documents for a given input. Has to be Whether or not to deduplicate the generations from different context documents for a given input. Has to be
set to :obj:`False` if used while training with a distributed backend. set to `False` if used while training with a distributed backend.
exclude_bos_score (:obj:`bool`, `optional`, defaults to :obj:`False`): exclude_bos_score (`bool`, *optional*, defaults to `False`):
Whether or not to disregard the BOS token when computing the loss. Whether or not to disregard the BOS token when computing the loss.
output_retrieved (:obj:`bool`, `optional`, defaults to :obj:`False`): output_retrieved (`bool`, *optional*, defaults to `False`):
If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and If set to `True`, `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
:obj:`context_attention_mask` are returned. See returned tensors for more detail. `context_attention_mask` are returned. See returned tensors for more detail.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (:obj:`int`, `optional`): forced_eos_token_id (`int`, *optional*):
The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to The id of the token to force as the last generated token when `max_length` is reached. Usually set to
:obj:`eos_token_id`. `eos_token_id`.
""" """
...@@ -174,21 +174,21 @@ class RagConfig(PretrainedConfig):
cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
) -> PretrainedConfig: ) -> PretrainedConfig:
r""" r"""
Instantiate an :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model Instantiate an [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
configuration and decoder model configuration. configuration and decoder model configuration.
Returns: Returns:
:class:`EncoderDecoderConfig`: An instance of a configuration object [`EncoderDecoderConfig`]: An instance of a configuration object
""" """
return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs) return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)
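For context, a hedged usage sketch of the composition helper above. It assumes DPR and BART configurations for the question encoder and generator, as in the library's RAG checkpoints; the extra keyword arguments are the ones documented in `RAG_CONFIG_DOC`.

```python
>>> from transformers import BartConfig, DPRConfig, RagConfig

>>> # Compose a RAG configuration from a question-encoder config and a generator config.
>>> question_encoder_config = DPRConfig()
>>> generator_config = BartConfig()
>>> rag_config = RagConfig.from_question_encoder_generator_configs(
...     question_encoder_config,
...     generator_config,
...     n_docs=5,  # number of documents to retrieve
...     max_combined_length=300,  # max length of the contextualized input
... )

>>> rag_config.n_docs
5
```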
def to_dict(self): def to_dict(self):
""" """
Serializes this instance to a Python dictionary. Overrides the default Serializes this instance to a Python dictionary. Overrides the default
:meth:`~transformers.PretrainedConfig.to_dict`. [`~PretrainedConfig.to_dict`].
Returns: Returns:
:obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
output["question_encoder"] = self.question_encoder.to_dict() output["question_encoder"] = self.question_encoder.to_dict()
......
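The `to_dict` override above nests the sub-configurations into the serialized output. A short sketch of what that implies in practice, assuming the round trip through `PretrainedConfig.from_dict` behaves as it does for other composite configurations:

```python
>>> from transformers import BartConfig, DPRConfig, RagConfig

>>> rag_config = RagConfig.from_question_encoder_generator_configs(DPRConfig(), BartConfig())

>>> # The serialized dictionary contains the nested sub-configuration dictionaries.
>>> config_dict = rag_config.to_dict()
>>> "question_encoder" in config_dict and "generator" in config_dict
True

>>> # The dictionary can be used to rebuild an equivalent configuration.
>>> roundtrip = RagConfig.from_dict(config_dict)
>>> roundtrip.n_docs == rag_config.n_docs
True
```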
...@@ -544,19 +544,20 @@ class RagModel(RagPreTrainedModel):
r""" r"""
Returns: Returns:
Example:: Example:
>>> from transformers import RagTokenizer, RagRetriever, RagModel ```python
>>> import torch >>> from transformers import RagTokenizer, RagRetriever, RagModel
>>> import torch
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base") >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True) >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call >>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever) >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt") >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"]) >>> outputs = model(input_ids=inputs["input_ids"])
""" ```"""
n_docs = n_docs if n_docs is not None else self.config.n_docs n_docs = n_docs if n_docs is not None else self.config.n_docs
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......
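As a follow-up to the forward-pass example above, a hedged sketch of end-to-end generation. It assumes the `facebook/rag-token-nq` checkpoint and the `RagTokenForGeneration` class, as used elsewhere in the library's RAG documentation; the question is only illustrative.

```python
>>> from transformers import RagRetriever, RagTokenForGeneration, RagTokenizer

>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> # generate() marginalizes over the retrieved documents before decoding.
>>> generated_ids = model.generate(input_ids=inputs["input_ids"])
>>> answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```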