Unverified Commit d28e647f authored by Jiahui Wei's avatar Jiahui Wei Committed by GitHub
Browse files

Fix mismatched ` in doc & other common typos (#31516)



fix common doc typos
Co-authored-by: default avatarJiahui Wei <jiahui.wei@tusen.ai>
parent 6d430616
......@@ -307,9 +307,9 @@ class AlignConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AlignVisionConfig`].
projection_dim (`int`, *optional*, defaults to 640):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
temperature_init_value (`float`, *optional*, defaults to 1.0):
The inital value of the *temperature* paramter. Default is used as per the original ALIGN implementation.
The initial value of the *temperature* parameter. Default is used as per the original ALIGN implementation.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
kwargs (*optional*):
......
......@@ -80,7 +80,7 @@ class AltCLIPTextConfig(PretrainedConfig):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
project_dim (`int`, *optional*, defaults to 768):
The dimentions of the teacher model before the mapping layer.
The dimensions of the teacher model before the mapping layer.
Examples:
......@@ -159,7 +159,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
......@@ -172,7 +172,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -268,9 +268,9 @@ class AltCLIPConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AltCLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 768):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
......@@ -333,7 +333,7 @@ class AltCLIPConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `AltCLIPTextConfig`. The "
f'value `text_config["{key}"]` will be overriden.'
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -365,7 +365,7 @@ class AltCLIPConfig(PretrainedConfig):
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `AltCLIPVisionConfig`. "
f'The value `vision_config["{key}"]` will be overriden.'
f'The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -54,7 +54,7 @@ class BlipTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
......@@ -191,7 +191,7 @@ class BlipVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -280,11 +280,11 @@ class BlipConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`BlipVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original BLIP implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimentionality of the hidden state of the image-text fusion layer.
Dimensionality of the hidden state of the image-text fusion layer.
label_smoothing (float, optional, *optional*, defaults to 0.0):
A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets
become a mixture of the original ground truth and a uniform distribution as described in
......
......@@ -51,7 +51,7 @@ class Blip2VisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
`"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
to 1e-5): The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
......
......@@ -177,7 +177,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
......@@ -190,7 +190,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -285,9 +285,9 @@ class ChineseCLIPConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`ChineseCLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original ChineseCLIP
The initial value of the *logit_scale* parameter. Default is used as per the original ChineseCLIP
implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
......@@ -351,7 +351,7 @@ class ChineseCLIPConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `ChineseCLIPTextConfig`. "
f'The value `text_config["{key}"]` will be overriden.'
f'The value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -383,7 +383,7 @@ class ChineseCLIPConfig(PretrainedConfig):
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize "
f'`ChineseCLIPVisionConfig`. The value `vision_config["{key}"]` will be overriden.'
f'`ChineseCLIPVisionConfig`. The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -342,9 +342,9 @@ class ClapConfig(PretrainedConfig):
audio_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`ClapAudioConfig`].
logit_scale_init_value (`float`, *optional*, defaults to 14.29):
The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and audio projection layers.
Dimensionality of text and audio projection layers.
projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
Activation function for the projection layers.
initializer_factor (`float`, *optional*, defaults to 1.0):
......
......@@ -50,7 +50,7 @@ class CLIPTextConfig(PretrainedConfig):
intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
......@@ -165,7 +165,7 @@ class CLIPVisionConfig(PretrainedConfig):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
......@@ -178,7 +178,7 @@ class CLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -274,9 +274,9 @@ class CLIPConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
......@@ -340,7 +340,7 @@ class CLIPConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
f'value `text_config["{key}"]` will be overriden.'
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -372,7 +372,7 @@ class CLIPConfig(PretrainedConfig):
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
f'The value `vision_config["{key}"]` will be overriden.'
f'The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -51,7 +51,7 @@ class CLIPSegTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -163,7 +163,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -259,7 +259,7 @@ class CLIPSegConfig(PretrainedConfig):
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original CLIPSeg implementation.
extract_layers (`List[int]`, *optional*, defaults to `[3, 6, 9]`):
Layers to extract when forwarding the query image through the frozen visual backbone of CLIP.
reduce_dim (`int`, *optional*, defaults to 64):
......@@ -270,7 +270,7 @@ class CLIPSegConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
conditional_layer (`int`, *optional*, defaults to 0):
......@@ -354,7 +354,7 @@ class CLIPSegConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `CLIPSegTextConfig`. The "
f'value `text_config["{key}"]` will be overriden.'
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -386,7 +386,7 @@ class CLIPSegConfig(PretrainedConfig):
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `CLIPSegVisionConfig`. "
f'The value `vision_config["{key}"]` will be overriden.'
f'The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -351,9 +351,9 @@ class ClvpConfig(PretrainedConfig):
decoder_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`ClvpDecoderConfig`].
projection_dim (`int`, *optional*, defaults to 768):
Dimentionality of text and speech projection layers.
Dimensionality of text and speech projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original CLVP implementation.
The initial value of the *logit_scale* parameter. Default is used as per the original CLVP implementation.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
testing).
......
......@@ -483,9 +483,9 @@ class FlavaConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and image projection layers.
Dimensionality of text and image projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* paramter. Default is used as per the original FLAVA/CLIP
The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP
implementation.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
......@@ -590,7 +590,7 @@ class FlavaConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The "
f'value `text_config["{key}"]` will be overriden.'
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -622,7 +622,7 @@ class FlavaConfig(PretrainedConfig):
else:
message = (
f"`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. "
f'The value `image_config["{key}"]` will be overriden.'
f'The value `image_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -654,7 +654,7 @@ class FlavaConfig(PretrainedConfig):
else:
message = (
f"`multimodal_config_dict` is provided which will be used to initialize "
f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overriden.'
f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -687,7 +687,7 @@ class FlavaConfig(PretrainedConfig):
else:
message = (
f"`image_codebook_config_dict` is provided which will be used to initialize "
f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overriden.'
f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -48,7 +48,7 @@ class GitVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......
......@@ -58,7 +58,7 @@ class GroupViTTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -169,7 +169,7 @@ class GroupViTVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
......@@ -281,11 +281,11 @@ class GroupViTConfig(PretrainedConfig):
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
projection_dim (`int`, *optional*, defaults to 256):
Dimentionality of text and vision projection layers.
Dimensionality of text and vision projection layers.
projection_intermediate_dim (`int`, *optional*, defaults to 4096):
Dimentionality of intermediate layer of text and vision projection layers.
Dimensionality of intermediate layer of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* parameter. Default is used as per the original GroupViT
The initial value of the *logit_scale* parameter. Default is used as per the original GroupViT
implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
......@@ -333,7 +333,7 @@ class GroupViTConfig(PretrainedConfig):
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. "
f'The value `text_config["{key}"]` will be overriden.'
f'The value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
......@@ -365,7 +365,7 @@ class GroupViTConfig(PretrainedConfig):
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `GroupViTVisionConfig`."
f' The value `vision_config["{key}"]` will be overriden.'
f' The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
......
......@@ -54,7 +54,7 @@ class IdeficsVisionConfig(PretrainedConfig):
Number of image channels.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......
......@@ -52,7 +52,7 @@ class Idefics2VisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......
......@@ -51,7 +51,7 @@ class InstructBlipVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. to 1e-5): The epsilon used by the layer
`"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. to 1e-5): The epsilon used by the layer
normalization layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
......
......@@ -170,7 +170,7 @@ class Kosmos2VisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......
......@@ -57,7 +57,7 @@ class Owlv2TextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -170,7 +170,7 @@ class Owlv2VisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -267,7 +267,7 @@ class Owlv2Config(PretrainedConfig):
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* parameter. Default is used as per the original OWLv2
The initial value of the *logit_scale* parameter. Default is used as per the original OWLv2
implementation.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not the model should return a dictionary. If `False`, returns a tuple.
......
......@@ -59,7 +59,7 @@ class OwlViTTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -171,7 +171,7 @@ class OwlViTVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -267,7 +267,7 @@ class OwlViTConfig(PretrainedConfig):
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The inital value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
implementation.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not the model should return a dictionary. If `False`, returns a tuple.
......
......@@ -189,7 +189,7 @@ class Pix2StructVisionConfig(PretrainedConfig):
Number of attention heads for each attention layer in the Transformer encoder.
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
dropout_rate (`float`, *optional*, defaults to 0.0):
......
......@@ -156,7 +156,7 @@ class SiglipVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment