Unverified Commit d9850abd authored by Yih-Dar, committed by GitHub

Fix `AssertionError` in clip conversion script (#30321)



* fix

* fix

* fix

* update comments

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 01ae3b87
@@ -124,7 +124,15 @@ def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa
     copy_vison_model_and_projection(hf_model, pt_model)
     hf_model.logit_scale = pt_model.logit_scale
 
-    input_ids = torch.arange(0, 77).unsqueeze(0)
+    # Use `eos_token` so the example is more meaningful
+    input_ids = torch.tensor(
+        [
+            [config.text_config.bos_token_id]
+            + list(range(3, 77))
+            + [config.text_config.eos_token_id]
+            + [config.text_config.pad_token_id]
+        ]
+    )
     pixel_values = torch.randn(1, 3, 224, 224)
 
     hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
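Context for this hunk (a reading of the fix, not stated in the commit message): the HF CLIP text model pools the hidden state at the first position whose token id equals `eos_token_id`, while the original OpenAI checkpoint pools at the argmax over the raw token ids. The old dummy input `torch.arange(0, 77)` either lacks the eos id entirely or places it near the start, so the two models can pool different positions and the script's output-closeness assertion fails; the new dummy input ends with a real `eos_token_id` followed by padding. A minimal sketch of the lookup, using illustrative ids (bos 49406, eos 49407, pad 1; the script reads the real values from `config.text_config`):

import torch

EOS_TOKEN_ID = 49407  # illustrative; the script uses config.text_config.eos_token_id

def first_eos_position(input_ids: torch.Tensor) -> torch.Tensor:
    # Same lookup pattern as the modeling code: index of the first `eos_token_id` per sequence.
    # If no eos is present, the mask is all zeros and argmax falls back to position 0.
    return (input_ids.to(torch.int) == EOS_TOKEN_ID).int().argmax(dim=-1)

old_ids = torch.arange(0, 77).unsqueeze(0)  # the old dummy input: no eos id anywhere
new_ids = torch.tensor([[49406] + list(range(3, 77)) + [49407] + [1]])  # bos ... eos pad

print(first_eos_position(old_ids))  # tensor([0])  -> not where the original model pools
print(first_eos_position(new_ids))  # tensor([75]) -> the actual eos position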
@@ -734,6 +734,7 @@ class CLIPTextTransformer(nn.Module):
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                 # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+                # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
                 (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                 .int()
                 .argmax(dim=-1),
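The added note documents the assumption this lookup relies on: a boolean mask followed by `argmax` returns the first matching position, which is what we want when padding reuses the eos id, but it silently returns position 0 when no eos is present. A small self-contained illustration of the pooling pattern (dummy shapes and ids, not the library code itself):

import torch

eos_token_id = 2  # illustrative id; the model reads it from its config

last_hidden_state = torch.randn(2, 5, 8)  # (batch, seq_len, hidden)
input_ids = torch.tensor(
    [
        [0, 7, 2, 2, 2],  # eos at position 2, followed by pads that share the eos id
        [0, 9, 4, 3, 2],  # eos at position 4
    ]
)

# First occurrence of `eos_token_id` in each row (argmax returns the first maximum).
eos_positions = (input_ids == eos_token_id).int().argmax(dim=-1)
# Advanced indexing: one pooled vector per batch element.
pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), eos_positions]

print(eos_positions)        # tensor([2, 4])
print(pooled_output.shape)  # torch.Size([2, 8])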
@@ -736,6 +736,7 @@ class CLIPSegTextTransformer(nn.Module):
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                 # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+                # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
                 (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                 .int()
                 .argmax(dim=-1),
@@ -1118,6 +1118,7 @@ class GroupViTTextTransformer(nn.Module):
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                 # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+                # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
                 (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                 .int()
                 .argmax(dim=-1),