Unverified Commit e68ec18c authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Docs: formatting nits (#32247)



* doc formatting nits

* ignore non-autodocs

* Apply suggestions from code review
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fixup

---------
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 2fbbcf50
...@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel): ...@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
...@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig): ...@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information. documentation from [`PretrainedConfig`] for more information.
Args: Args:
additional_vocab_size (`int`, *optional`, defaults to 0): additional_vocab_size (`int`, *optional*, defaults to 0):
Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
are always trainable whereas regular vocab tokens can be frozen or not. are always trainable whereas regular vocab tokens can be frozen or not.
vocab_size (`int`, *optional*, defaults to 32000): vocab_size (`int`, *optional*, defaults to 32000):
......
...@@ -97,7 +97,7 @@ def load_balancing_loss_func( ...@@ -97,7 +97,7 @@ def load_balancing_loss_func(
router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -69,7 +69,7 @@ def load_balancing_loss_func( ...@@ -69,7 +69,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin): ...@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin):
Args: Args:
bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*): bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
The bounding bboxes associated to `texts`. The bounding bboxes associated to `texts`.
num_image_tokens (`int`, defaults to 64): num_image_tokens (`int`, *optional*, defaults to 64):
The number of (consecutive) places that are used to mark the placeholders to store image information. The number of (consecutive) places that are used to mark the placeholders to store image information.
This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using. This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
first_image_token_id (`int`, *optional*): first_image_token_id (`int`, *optional*):
......
...@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): ...@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution. Calculate the number of patches after the preprocessing for images of any resolution.
Args: Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]): image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ? The size of the input image in the format (height, width). ?
grid_pinpoints (`List`): grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list A list containing possible resolutions. Each item in the list should be a tuple or list
......
...@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): ...@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution. Calculate the number of patches after the preprocessing for images of any resolution.
Args: Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]): image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ? The size of the input image in the format (height, width). ?
grid_pinpoints (`List`): grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list A list containing possible resolutions. Each item in the list should be a tuple or list
......
...@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel): ...@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:
......
...@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module): ...@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`): encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the Sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross(masked)-attention of the decoder. cross(masked)-attention of the decoder.
feature_size_list (`List[torch.Size]` ): feature_size_list (`List[torch.Size]`):
This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder. This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
output_attentions (`bool`, *optional*): output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under
......
...@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel): ...@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A, - 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence. - 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:
......
...@@ -84,7 +84,7 @@ def load_balancing_loss_func( ...@@ -84,7 +84,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r""" ...@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices. it will evenly distribute blocks across all devices.
Args: Args:
device_map (`Dict[int, list]`, optional, defaults to None): device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
......
...@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor): ...@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
Args: Args:
outputs ([`OneFormerForUniversalSegmentationOutput`]): outputs ([`OneFormerForUniversalSegmentationOutput`]):
The outputs from [`OneFormerForUniversalSegmentationOutput`]. The outputs from [`OneFormerForUniversalSegmentationOutput`].
task_type (`str`, *optional)*, defaults to "instance"): task_type (`str`, *optional*, defaults to "instance"):
The post processing depends on the task token input. If the `task_type` is "panoptic", we need to The post processing depends on the task token input. If the `task_type` is "panoptic", we need to
ignore the stuff predictions. ignore the stuff predictions.
is_demo (`bool`, *optional*, defaults to `True`): is_demo (`bool`, *optional*, defaults to `True`):
......
...@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape): ...@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape):
channels is preserved. channels is preserved.
Returns Returns
image (`np.ndarray): image (`np.ndarray`):
The input image, but with additional singleton dimensions appended in the case where `len(output_shape) > The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
input.ndim`. input.ndim`.
output_shape (`Tuple`): output_shape (`Tuple`):
......
...@@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module): ...@@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module):
"""Normalization block """Normalization block
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module): ...@@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
"""This module mixes the features in the channel dimension. """This module mixes the features in the channel dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module): ...@@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module):
"""This module mixes the patch dimension. """This module mixes the patch dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module): ...@@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module):
"""This module mixes the hidden feature dimension. """This module mixes the hidden feature dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module): ...@@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module):
The `PatchTSMixer` layer that does all three kinds of mixing. The `PatchTSMixer` layer that does all three kinds of mixing.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module): ...@@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module):
"""The main computing framework of the `PatchTSMixer` model. """The main computing framework of the `PatchTSMixer` model.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module): ...@@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module):
"""Prediction Head for Forecasting """Prediction Head for Forecasting
Args: Args:
config (`PatchTSMixerConfig`, *required*): Configuration. config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
...@@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module): ...@@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module):
"""Linear head for Classification and Regression. """Linear head for Classification and Regression.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
...@@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module): ...@@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module):
"""Pretraining head. """Pretraining head.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel): ...@@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings. Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel): ...@@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for mask pretraining. `PatchTSMixer` for mask pretraining.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel): ...@@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for forecasting application. `PatchTSMixer` for forecasting application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel): ...@@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for classification application. `PatchTSMixer` for classification application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel): ...@@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for regression application. `PatchTSMixer` for regression application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
......
...@@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module): ...@@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module):
storage as `fused_qkv` storage as `fused_qkv`
Args: Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns: Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
......
...@@ -75,7 +75,7 @@ def load_balancing_loss_func( ...@@ -75,7 +75,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel): ...@@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
...@@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel): ...@@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
......
...@@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss ...@@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
...@@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL ...@@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
......
...@@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel): ...@@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment