Unverified Commit 98d40fed authored by Yih-Dar, committed by GitHub

Cleanup the usage of `layer_norm_eps` in some models (#21336)



* fix

* fix

* make style

* For CLIP

* For OwlViT

* For XCLIP

* For CLIPSeg

* For GroupViT

* fix docstrings

* fix docstrings

* For AltCLIP

* For ChineseCLIP

* For Blip

* For GiT

* make style

* update

* update

* update

* fix

* fix

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 623346ab
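
The change is the same across all the touched models: every `nn.LayerNorm` that previously fell back to PyTorch's default epsilon now receives the value configured on the model. A minimal sketch of the pattern (illustrative only, not taken from the diff; `MyConfig` and `MyEncoderLayer` are hypothetical names):

import torch
from torch import nn

class MyConfig:
    # PyTorch's own default for nn.LayerNorm is 1e-5, so for most configs in this commit
    # the numerical behavior is unchanged; the eps is simply no longer silently implied.
    def __init__(self, hidden_size=768, layer_norm_eps=1e-5):
        self.hidden_size = hidden_size
        self.layer_norm_eps = layer_norm_eps

class MyEncoderLayer(nn.Module):
    def __init__(self, config: MyConfig):
        super().__init__()
        # Before: nn.LayerNorm(config.hidden_size) -> eps fixed at the library default.
        # After: the eps comes from the config, so checkpoints that need a different
        # value (e.g. a BERT-style 1e-12 text tower) are respected.
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.layer_norm(hidden_states)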
@@ -173,8 +173,9 @@ class AltCLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -213,7 +214,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
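The default value itself does not change here: `0.00001` and `1e-5` are the same Python float, so rewriting the literal is purely cosmetic. A one-line sanity check (not part of the diff):

assert 0.00001 == 1e-5 == 1e-05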
@@ -844,9 +844,9 @@ class AltCLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = AltCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = AltCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -1099,9 +1099,9 @@ class AltCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = AltCLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = AltCLIPEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
...
@@ -74,8 +74,9 @@ class BlipTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
-            to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -207,8 +208,9 @@ class BlipVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
-            to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -247,7 +249,7 @@ class BlipVisionConfig(PretrainedConfig):
         image_size=384,
         patch_size=16,
         hidden_act="gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=1e-10,
...
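The corrected BLIP docstrings also make the asymmetry visible: the BERT-style text config documents `1e-12` while the ViT-style vision config documents `1e-5`. A quick way to confirm the two defaults (hypothetical usage; assumes a `transformers` install that includes this commit):

from transformers import BlipTextConfig, BlipVisionConfig

print(BlipTextConfig().layer_norm_eps)    # 1e-12
print(BlipVisionConfig().layer_norm_eps)  # 1e-05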
@@ -374,9 +374,9 @@ class BlipEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = BlipAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = BlipMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -665,7 +665,7 @@ class BlipVisionModel(BlipPreTrainedModel):
         self.embeddings = BlipVisionEmbeddings(config)
         self.encoder = BlipEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.post_init()
...
@@ -187,8 +187,9 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -225,7 +226,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -626,9 +626,9 @@ class ChineseCLIPVisionLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = ChineseCLIPVisionAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = ChineseCLIPVisionMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -1054,9 +1054,9 @@ class ChineseCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = ChineseCLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = ChineseCLIPVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig)
...
@@ -64,8 +64,9 @@ class CLIPTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -102,7 +103,7 @@ class CLIPTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -171,8 +172,9 @@ class CLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -211,7 +213,7 @@ class CLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
        dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -356,9 +356,9 @@ class CLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -680,7 +680,7 @@ class CLIPTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPTextEmbeddings(config)
         self.encoder = CLIPEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
@@ -830,9 +830,9 @@ class CLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = CLIPEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
...
@@ -56,8 +56,9 @@ class CLIPSegTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -93,7 +94,7 @@ class CLIPSegTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -161,8 +162,9 @@ class CLIPSegVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -200,7 +202,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -270,8 +272,7 @@ class CLIPSegConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
         decoder_intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
         conditional_layer (`int`, *optional*, defaults to 0):
...
@@ -379,9 +379,9 @@ class CLIPSegEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPSegAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPSegMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -691,7 +691,7 @@ class CLIPSegTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPSegTextEmbeddings(config)
         self.encoder = CLIPSegEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
@@ -837,9 +837,9 @@ class CLIPSegVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPSegVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = CLIPSegEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
@@ -1178,9 +1178,9 @@ class CLIPSegDecoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPSegAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPSegMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
...
@@ -54,8 +54,9 @@ class GitVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -94,7 +95,7 @@ class GitVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=16,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -762,9 +762,9 @@ class GitVisionEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = GitVisionAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = GitVisionMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -935,9 +935,9 @@ class GitVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = GitVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = GitVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig)
@@ -1048,7 +1048,8 @@ class GitProjection(nn.Module):
         super().__init__()
         self.config = config
         self.visual_projection = nn.Sequential(
-            nn.Linear(config.vision_config.hidden_size, config.hidden_size), nn.LayerNorm(config.hidden_size)
+            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
+            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
         )
     def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
...
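`GitProjection` is the one place in the GIT diff where the epsilon comes from a nested config (`config.vision_config`), because the projection normalizes vision features inside the composite model. A hedged sketch of that nested-config pattern with hypothetical classes (not the actual GIT classes):

from torch import nn

class VisionSubConfig:
    def __init__(self, hidden_size=768, layer_norm_eps=1e-5):
        self.hidden_size = hidden_size
        self.layer_norm_eps = layer_norm_eps

class CompositeConfig:
    def __init__(self, hidden_size=1024, vision_config=None):
        self.hidden_size = hidden_size
        self.vision_config = vision_config or VisionSubConfig()

class Projection(nn.Module):
    def __init__(self, config: CompositeConfig):
        super().__init__()
        # Project vision features to the text hidden size, then normalize with the
        # epsilon of the vision sub-config, mirroring the GitProjection change above.
        self.visual_projection = nn.Sequential(
            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
        )

    def forward(self, embeddings):
        return self.visual_projection(embeddings)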
@@ -100,7 +100,7 @@ class GroupViTTextConfig(PretrainedConfig):
         num_attention_heads=4,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -714,9 +714,9 @@ class GroupViTEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = GroupViTAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = GroupViTMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -1076,7 +1076,7 @@ class GroupViTTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = GroupViTTextEmbeddings(config)
         self.encoder = GroupViTTextEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
@@ -1219,7 +1219,7 @@ class GroupViTVisionTransformer(nn.Module):
         self.embeddings = GroupViTVisionEmbeddings(config)
         self.encoder = GroupViTVisionEncoder(config)
-        self.layernorm = nn.LayerNorm(embed_dim)
+        self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
...
@@ -1063,13 +1063,13 @@ class OneFormerPixelDecoderEncoderLayer(nn.Module):
             n_points=4,
         )
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.dropout = config.dropout
         self.activation_fn = nn.functional.relu
         self.activation_dropout = config.dropout
         self.fc1 = nn.Linear(self.embed_dim, config.encoder_feedforward_dim)
         self.fc2 = nn.Linear(config.encoder_feedforward_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.is_training = config.is_training
@@ -1634,11 +1634,13 @@ class OneFormerAttention(nn.Module):
 class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):
-    def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
+    ):
         super().__init__()
         self.self_attn = OneFormerAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, is_decoder=True)
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)
         self.activation = ACT2FN[activation]
@@ -1690,11 +1692,13 @@ class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):
 class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):
-    def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
+    ):
         super().__init__()
         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)
         self.activation = ACT2FN[activation]
@@ -1760,14 +1764,22 @@ class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):
 class OneFormerTransformerDecoderFFNLayer(nn.Module):
-    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self,
+        d_model,
+        dim_feedforward=2048,
+        dropout=0.0,
+        activation="relu",
+        normalize_before=False,
+        layer_norm_eps=1e-05,
+    ):
         super().__init__()
         # Implementation of Feedforward model
         self.linear1 = nn.Linear(d_model, dim_feedforward)
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_feedforward, d_model)
-        self.norm = nn.LayerNorm(d_model)
+        self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.activation = ACT2FN[activation]
         self.normalize_before = normalize_before
@@ -1836,6 +1848,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )
         self.self_attn = OneFormerTransformerDecoderSelfAttentionLayer(
@@ -1843,6 +1856,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )
         self.ffn = OneFormerTransformerDecoderFFNLayer(
@@ -1850,6 +1864,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             dim_feedforward=config.dim_feedforward,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )
     def forward(
@@ -1965,6 +1980,7 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
         dropout=0.1,
         activation="relu",
         normalize_before=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
@@ -1974,9 +1990,9 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_feedforward, d_model)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
+        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.dropout1 = nn.Dropout(dropout)
         self.dropout2 = nn.Dropout(dropout)
         self.dropout3 = nn.Dropout(dropout)
@@ -2094,13 +2110,14 @@ class OneFormerTransformerDecoderQueryTransformer(nn.Module):
         activation="relu",
         normalize_before=False,
         return_intermediate_dec=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         decoder_layer = OneFormerTransformerDecoderQueryTransformerDecoderLayer(
-            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before, layer_norm_eps
         )
-        decoder_norm = nn.LayerNorm(d_model)
+        decoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.decoder = OneFormerTransformerDecoderQueryTransformerDecoder(
             decoder_layer,
             num_decoder_layers,
@@ -2151,9 +2168,10 @@ class OneFormerTransformerDecoder(nn.Module):
             num_decoder_layers=config.query_dec_layers,
             normalize_before=config.pre_norm,
             return_intermediate_dec=False,
+            layer_norm_eps=config.layer_norm_eps,
         )
-        self.decoder_norm = nn.LayerNorm(config.hidden_dim)
+        self.decoder_norm = nn.LayerNorm(config.hidden_dim, eps=config.layer_norm_eps)
         self.num_feature_levels = 3
@@ -2456,14 +2474,15 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):
         d_model,
         nhead,
         dropout=0.1,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         self.self_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)
         self.cross_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
+        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)
         self.mlp = nn.Sequential(
@@ -2481,29 +2500,38 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):
 class OneFormerTextContextDecoder(nn.Module):
     def __init__(
-        self, transformer_width=256, transformer_heads=4, transformer_layers=6, visual_dim=1024, dropout=0.1, **kwargs
+        self,
+        transformer_width=256,
+        transformer_heads=4,
+        transformer_layers=6,
+        visual_dim=1024,
+        dropout=0.1,
+        layer_norm_eps=1e-05,
+        **kwargs
     ):
         super().__init__()
         self.memory_proj = nn.Sequential(
-            nn.LayerNorm(visual_dim),
+            nn.LayerNorm(visual_dim, eps=layer_norm_eps),
             nn.Linear(visual_dim, transformer_width),
-            nn.LayerNorm(transformer_width),
+            nn.LayerNorm(transformer_width, eps=layer_norm_eps),
         )
         self.text_proj = nn.Sequential(
-            nn.LayerNorm(visual_dim),
+            nn.LayerNorm(visual_dim, eps=layer_norm_eps),
             nn.Linear(visual_dim, transformer_width),
         )
         self.decoder = nn.ModuleList(
             [
-                OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout)
+                OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout, layer_norm_eps)
                 for _ in range(transformer_layers)
             ]
         )
-        self.out_proj = nn.Sequential(nn.LayerNorm(transformer_width), nn.Linear(transformer_width, visual_dim))
+        self.out_proj = nn.Sequential(
+            nn.LayerNorm(transformer_width, eps=layer_norm_eps), nn.Linear(transformer_width, visual_dim)
+        )
     def forward(self, text, visual):
         visual = self.memory_proj(visual)
@@ -2538,12 +2566,12 @@ class OneFormerTextMLP(nn.Module):
 class OneFormerTextTransformerLayer(nn.Module):
-    def __init__(self, width: int, heads: int, attn_mask: torch.Tensor):
+    def __init__(self, width: int, heads: int, attn_mask: torch.Tensor, layer_norm_eps=1e-05):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(width, heads)
-        self.layer_norm1 = nn.LayerNorm(width)
+        self.layer_norm1 = nn.LayerNorm(width, eps=layer_norm_eps)
         self.mlp = OneFormerTextMLP(width, width * 4, width)
-        self.layer_norm2 = nn.LayerNorm(width)
+        self.layer_norm2 = nn.LayerNorm(width, eps=layer_norm_eps)
         self.attn_mask = attn_mask
     def forward(
@@ -2572,11 +2600,21 @@ class OneFormerTextTransformerLayer(nn.Module):
 class OneFormerTextTransformer(nn.Module):
-    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
+    def __init__(
+        self,
+        width: int,
+        layers: int,
+        heads: int,
+        attn_mask: torch.Tensor = None,
+        use_checkpoint=False,
+        layer_norm_eps=1e-05,
+    ):
         super().__init__()
         self.width = width
         self.num_layers = layers
-        self.layers = nn.Sequential(*[OneFormerTextTransformerLayer(width, heads, attn_mask) for _ in range(layers)])
+        self.layers = nn.Sequential(
+            *[OneFormerTextTransformerLayer(width, heads, attn_mask, layer_norm_eps) for _ in range(layers)]
+        )
         self.use_checkpoint = use_checkpoint
     def forward(self, hidden_states: torch.Tensor):
@@ -2596,6 +2634,7 @@ class OneFormerTextEncoder(nn.Module):
         layers: int,
         vocab_size,
         use_checkpoint=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         heads = width // 64
@@ -2607,10 +2646,11 @@ class OneFormerTextEncoder(nn.Module):
             heads=heads,
             attn_mask=self.build_attention_mask(),
             use_checkpoint=use_checkpoint,
+            layer_norm_eps=layer_norm_eps,
         )
         self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
-        self.ln_final = nn.LayerNorm(width)
+        self.ln_final = nn.LayerNorm(width, eps=layer_norm_eps)
         self.token_embedding = nn.Embedding(vocab_size, width)
     def build_attention_mask(self):
@@ -2641,6 +2681,7 @@ class OneFormerTextMapper(nn.Module):
             width=config.text_encoder_width,
             layers=config.text_encoder_num_layers,
             vocab_size=config.text_encoder_vocab_size,
+            layer_norm_eps=config.layer_norm_eps,
         )
         self.text_projector = OneFormerMLPPredictionHead(
...
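The OneFormer changes differ from the CLIP-style ones: the inner decoder and text-mapper submodules do not hold a reference to the config, so `layer_norm_eps` is threaded through their constructors as a plain keyword argument (defaulting to `1e-05`), and only the config-aware modules read it from `config.layer_norm_eps`. A small sketch of that threading pattern with hypothetical classes (not the actual OneFormer modules):

from torch import nn

class InnerLayer(nn.Module):
    # No config object here; the epsilon arrives as a constructor argument.
    def __init__(self, width: int, layer_norm_eps: float = 1e-05):
        super().__init__()
        self.norm = nn.LayerNorm(width, eps=layer_norm_eps)

class OuterModule(nn.Module):
    # Only the config-aware module sees the config and forwards its eps downward.
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList(
            [InnerLayer(config.hidden_dim, layer_norm_eps=config.layer_norm_eps) for _ in range(config.num_layers)]
        )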
@@ -66,8 +66,9 @@ class OwlViTTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -103,7 +104,7 @@ class OwlViTTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=16,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -173,8 +174,9 @@ class OwlViTVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -212,7 +214,7 @@ class OwlViTVisionConfig(PretrainedConfig):
         image_size=768,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -476,9 +476,9 @@ class OwlViTEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = OwlViTAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = OwlViTMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -790,7 +790,7 @@ class OwlViTTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = OwlViTTextEmbeddings(config)
         self.encoder = OwlViTEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
@@ -922,9 +922,9 @@ class OwlViTVisionTransformer(nn.Module):
         self.config = config
         self.embeddings = OwlViTVisionEmbeddings(config)
-        self.pre_layernorm = nn.LayerNorm(config.hidden_size)
+        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.encoder = OwlViTEncoder(config)
-        self.post_layernorm = nn.LayerNorm(config.hidden_size)
+        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
@@ -1318,7 +1318,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         self.class_head = OwlViTClassPredictionHead(config)
         self.box_head = OwlViTBoxPredictionHead(config)
-        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
         self.sigmoid = nn.Sigmoid()
     def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
...
@@ -95,7 +95,7 @@ class XCLIPTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -220,7 +220,7 @@ class XCLIPVisionConfig(PretrainedConfig):
         patch_size=32,
         num_frames=8,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
...
@@ -311,9 +311,9 @@ class XCLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = XCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = XCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -403,15 +403,15 @@ class XCLIPVisionEncoderLayer(nn.Module):
         self.embed_dim = config.hidden_size
         self.message_fc = nn.Linear(self.embed_dim, self.embed_dim)
-        self.message_ln = nn.LayerNorm(self.embed_dim)
+        self.message_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.message_attn = XCLIPAttention(config)
         self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
         self.self_attn = XCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = XCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
     def forward(
         self,
@@ -744,7 +744,7 @@ class XCLIPTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = XCLIPTextEmbeddings(config)
         self.encoder = XCLIPEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
@@ -989,9 +989,9 @@ class XCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = XCLIPVisionEmbeddings(config)
-        self.pre_layernorm = nn.LayerNorm(embed_dim)
+        self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = XCLIPVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
     @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
@@ -1218,8 +1218,8 @@ class PromptGeneratorLayer(nn.Module):
         embed_dim = config.projection_dim
         self.cross_attn = XCLIPCrossAttention(config)
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.norm3 = nn.LayerNorm(embed_dim)
+        self.norm1 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
+        self.norm3 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
         self.mlp = nn.Sequential(
             nn.Linear(embed_dim, embed_dim * 4),
             ACT2FN[config.prompt_hidden_act],
@@ -1239,7 +1239,7 @@ class XCLIPPromptGenerator(nn.Module):
     def __init__(self, config):
         super().__init__()
         embed_dim = config.projection_dim
-        self.layernorm = nn.LayerNorm(embed_dim)
+        self.layernorm = nn.LayerNorm(embed_dim, eps=config.vision_config.layer_norm_eps)
         self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)])
         self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha)
@@ -1284,7 +1284,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
         self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
-        self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim)
+        self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
         self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))
         mit_config = copy(vision_config)
...
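After this change the configured epsilon is observable on the instantiated modules. A quick check (hypothetical usage; assumes a `transformers` build that includes this commit and that `CLIPVisionModel` exposes the transformer above as `vision_model`, as in the transformers source):

from transformers import CLIPVisionConfig, CLIPVisionModel

config = CLIPVisionConfig(layer_norm_eps=1e-6)  # non-default value so the effect is visible
model = CLIPVisionModel(config)
print(model.vision_model.post_layernorm.eps)  # 1e-06 instead of the previously hard-coded default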