"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "ebd45980a0e57122eebce467e08f7c59a7c712eb"
Unverified commit 4d10de55 authored by hamid mohammadi, committed by GitHub

Feature to convert VideoMAE huge and small models finetuned on Kinetics and SSv2 added to the VideoMAE-to-PyTorch converter (#22788)

* Add conversion support for VideoMAE huge finetuned on Kinetics, and VideoMAE small finetuned on Kinetics and SSv2, to the VideoMAE-to-PyTorch converter

* Reformat convert_videomae_to_pytorch using black

* Raise a ValueError when the model name does not match a known VideoMAE architecture
parent 7579a52b
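
For context, once these checkpoints are converted and uploaded, they can be loaded through the regular transformers video-classification API. The snippet below is a minimal sketch; the repo id follows the naming of the existing MCG-NJU VideoMAE uploads and is an assumption, not something stated in this commit.

import numpy as np
import torch
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

# Repo id is assumed to mirror the existing MCG-NJU VideoMAE uploads.
repo_id = "MCG-NJU/videomae-huge-finetuned-kinetics"
processor = VideoMAEImageProcessor.from_pretrained(repo_id)
model = VideoMAEForVideoClassification.from_pretrained(repo_id)

video = list(np.random.randn(16, 3, 224, 224))  # 16 dummy frames, channels-first
inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # expected shape (1, 400) for Kinetics-400
print(model.config.id2label[int(logits.argmax(-1))])
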
@@ -33,15 +33,7 @@ from transformers import (
 def get_videomae_config(model_name):
     config = VideoMAEConfig()
 
-    if "large" in model_name:
-        config.hidden_size = 1024
-        config.intermediate_size = 4096
-        config.num_hidden_layers = 24
-        config.num_attention_heads = 16
-        config.decoder_num_hidden_layers = 12
-        config.decoder_num_attention_heads = 8
-        config.decoder_hidden_size = 512
-        config.decoder_intermediate_size = 2048
+    set_architecture_configs(model_name, config)
 
     if "finetuned" not in model_name:
         config.use_mean_pooling = False
@@ -64,6 +56,38 @@ def get_videomae_config(model_name):
     return config
 
 
+def set_architecture_configs(model_name, config):
+    if "small" in model_name:
+        config.hidden_size = 384
+        config.intermediate_size = 1536
+        config.num_hidden_layers = 12
+        config.num_attention_heads = 16
+        config.decoder_num_hidden_layers = 12
+        config.decoder_num_attention_heads = 3
+        config.decoder_hidden_size = 192
+        config.decoder_intermediate_size = 768
+    elif "large" in model_name:
+        config.hidden_size = 1024
+        config.intermediate_size = 4096
+        config.num_hidden_layers = 24
+        config.num_attention_heads = 16
+        config.decoder_num_hidden_layers = 12
+        config.decoder_num_attention_heads = 8
+        config.decoder_hidden_size = 512
+        config.decoder_intermediate_size = 2048
+    elif "huge" in model_name:
+        config.hidden_size = 1280
+        config.intermediate_size = 5120
+        config.num_hidden_layers = 32
+        config.num_attention_heads = 16
+        config.decoder_num_hidden_layers = 12
+        config.decoder_num_attention_heads = 8
+        config.decoder_hidden_size = 640
+        config.decoder_intermediate_size = 2560
+    elif "base" not in model_name:
+        raise ValueError('Model name should include either "small", "base", "large", or "huge"')
+
+
 def rename_key(name):
     if "encoder." in name:
         name = name.replace("encoder.", "")
@@ -186,6 +210,8 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
     logits = outputs.logits
 
     model_names = [
+        "videomae-small-finetuned-kinetics",
+        "videomae-small-finetuned-ssv2",
         # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600)
         "videomae-base-short",
         "videomae-base-short-finetuned-kinetics",
@@ -193,6 +219,7 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
         "videomae-base-finetuned-kinetics",
         "videomae-large",
         "videomae-large-finetuned-kinetics",
+        "videomae-huge-finetuned-kinetics",
         # Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400)
         "videomae-base-short-ssv2",
         "videomae-base-short-finetuned-ssv2",
@@ -201,7 +228,13 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
     ]
 
     # NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5]
-    if model_name == "videomae-base":
+    if model_name == "videomae-small-finetuned-kinetics":
+        expected_shape = torch.Size([1, 400])
+        expected_slice = torch.tensor([-0.9291, -0.4061, -0.9307])
+    elif model_name == "videomae-small-finetuned-ssv2":
+        expected_shape = torch.Size([1, 174])
+        expected_slice = torch.tensor([0.2671, -0.4689, -0.8235])
+    elif model_name == "videomae-base":
         expected_shape = torch.Size([1, 1408, 1536])
         expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]])
     elif model_name == "videomae-base-short":
@@ -215,6 +248,9 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
     elif model_name == "videomae-large-finetuned-kinetics":
         expected_shape = torch.Size([1, 400])
         expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
+    elif model_name == "videomae-huge-finetuned-kinetics":
+        expected_shape = torch.Size([1, 400])
+        expected_slice = torch.tensor([0.2433, 0.1632, -0.4894])
     elif model_name == "videomae-base-short-finetuned-kinetics":
         expected_shape = torch.Size([1, 400])
         expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
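
These hunks only add the reference values; the comparison against the model output lives further down the script, outside the displayed context. A hedged sketch of the kind of check the new slices presumably feed into (the exact indexing and tolerance are assumptions):

import torch

def check_logits(logits, expected_shape, expected_slice, model_name, atol=1e-4):
    # Assumed form of the verification; the real check is not shown in this diff.
    assert logits.shape == expected_shape
    if "finetuned" in model_name:
        # Classification checkpoints: compare the first few class logits.
        assert torch.allclose(logits[0, :3], expected_slice, atol=atol)
    else:
        # Pretraining checkpoints: compare a small patch of reconstruction logits.
        assert torch.allclose(logits[0, :3, :3], expected_slice, atol=atol)
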
......