"docs/vscode:/vscode.git/clone" did not exist on "0410a29a2d5c798b2c0c1ca28398e0ddcf3384f2"
Unverified Commit 49e812d1 authored by Stas Bekman, committed by GitHub

[several models] improve readability (#24585)

* [modeling_clip.py] improve readability

* apply to other models

* fix
parent 134caef3
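
The pattern changed throughout is the initialization of scalar (0-dimensional) parameters: `torch.ones([]) * init_value` is replaced with the more direct `torch.tensor(init_value)`. A minimal sketch of the equivalence follows; the value and names are illustrative, not taken from any particular config:

```python
import torch
import torch.nn as nn

init_value = 2.6592  # illustrative scalar, standing in for config.logit_scale_init_value

# Old style: build a 0-dim tensor of ones, then scale it by the init value
logit_scale_old = nn.Parameter(torch.ones([]) * init_value)

# New style: construct the 0-dim scalar tensor directly
logit_scale_new = nn.Parameter(torch.tensor(init_value))

# Both produce a scalar (0-dimensional) parameter holding the same value
assert logit_scale_old.shape == logit_scale_new.shape == torch.Size([])
assert torch.isclose(logit_scale_old, logit_scale_new)
```
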
@@ -1444,7 +1444,7 @@ class AlignModel(AlignPreTrainedModel):
         self.vision_model = AlignVisionModel(vision_config)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim)
-        self.temperature = nn.Parameter(torch.ones([]) * self.config.temperature_init_value)
+        self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -1506,7 +1506,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -743,7 +743,7 @@ class BlipModel(BlipPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -1778,7 +1778,7 @@ class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
         self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
         self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -1376,7 +1376,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -1956,8 +1956,8 @@ class ClapModel(ClapPreTrainedModel):
         text_config = config.text_config
         audio_config = config.audio_config
-        self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
-        self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
+        self.logit_scale_a = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
+        self.logit_scale_t = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
         self.projection_dim = config.projection_dim

@@ -977,7 +977,7 @@ class CLIPModel(CLIPPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -979,7 +979,7 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -1229,7 +1229,7 @@ class FlavaModel(FlavaPreTrainedModel):
         self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
         self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
         self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)

@@ -1368,7 +1368,7 @@ class GroupViTModel(GroupViTPreTrainedModel):
             nn.ReLU(inplace=True),
             nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
         )
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -399,7 +399,7 @@ class OneFormerLoss(nn.Module):
         self.importance_sample_ratio = importance_sample_ratio
         self.contrastive_temperature = contrastive_temperature
         if self.contrastive_temperature is not None:
-            self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / contrastive_temperature))
+            self.logit_scale = nn.Parameter(torch.tensor(np.log(1 / contrastive_temperature)))

     def _max_by_axis(self, the_list: List[List[int]]) -> List[int]:
         maxes = the_list[0]

@@ -1065,7 +1065,7 @@ class OwlViTModel(OwlViTPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()

@@ -204,7 +204,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

     @add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING)
     def get_text_features(

@@ -1309,7 +1309,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
         self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))