Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_sgd_coslr_100e.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
model = dict(
backbone=dict(
frozen_stages=4,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
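# NOTE: the empty `checkpoint` above is a placeholder for the MoCo v3
# pre-trained weights. A typical (hypothetical) way to fill it in without
# editing the file is the standard MMEngine override, e.g.:
#   python tools/train.py <this_config.py> \
#       --cfg-options model.backbone.init_cfg.checkpoint=path/to/mocov3_pretrain.pth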
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.4, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='MoCoV3ViT',
arch='base', # embed_dim = 768
img_size=224,
patch_size=16,
stop_grad_conv1=True,
frozen_stages=12,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
init_cfg=dict(type='Normal', std=0.01, layer='Linear'),
))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=12, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
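# The `augments` list above is applied as a batch augmentation: for each
# training batch one of Mixup (alpha=0.8) or CutMix (alpha=1.0) is sampled,
# which is the common recipe for fine-tuning ViT-style classifiers.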
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW', lr=5e-4, eps=1e-8, betas=(0.9, 0.999),
weight_decay=0.05),
clip_grad=dict(max_norm=5.0),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
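# The `paramwise_cfg` above removes weight decay from normalization layers,
# biases, the class token and the position embedding, which is the usual
# practice when fine-tuning ViT backbones with AdamW.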
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-3,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=145,
eta_min=1e-5,
by_epoch=True,
begin=5,
end=150,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=150)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
randomness = dict(seed=0)
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='large',
img_size=224,
patch_size=16,
drop_path_rate=0.5,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW', lr=5e-4, eps=1e-8, betas=(0.9, 0.999),
weight_decay=0.05),
clip_grad=dict(max_norm=5.0),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-3,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
eta_min=1e-5,
by_epoch=True,
begin=5,
end=100,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
randomness = dict(seed=0)
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='MoCoV3ViT',
arch='mocov3-small', # embed_dim = 384
img_size=224,
patch_size=16,
stop_grad_conv1=True,
frozen_stages=12,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=384,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
init_cfg=dict(type='Normal', std=0.01, layer='Linear'),
))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=12, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
Collections:
- Name: MoCoV3
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- LARS
Training Resources: 32x V100 GPUs
Architecture:
- ResNet
- ViT
- MoCo
Paper:
Title: An Empirical Study of Training Self-Supervised Vision Transformers
URL: https://arxiv.org/abs/2104.02057
README: configs/mocov3/README.md
Models:
- Name: mocov3_resnet50_8xb512-amp-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py
Downstream:
- resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k
- Name: mocov3_resnet50_8xb512-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k.py
Downstream:
- resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k
- Name: mocov3_resnet50_8xb512-amp-coslr-800e_in1k
Metadata:
Epochs: 800
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k.py
Downstream:
- resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k
- Name: resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 69.6
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 72.8
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.4
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 4607954304
Parameters: 84266752
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth
Config: configs/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k.py
Downstream:
- vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
- Name: vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4607954304
Parameters: 22050664
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 73.6
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.pth
Config: configs/mocov3/benchmarks/vit-small-p16_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 17581972224
Parameters: 215678464
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth
Config: configs/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k.py
Downstream:
- vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
- vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k
- Name: vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k
Metadata:
Epochs: 150
Batch Size: 512
FLOPs: 17581972224
Parameters: 86567656
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.0
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.pth
Config: configs/mocov3/benchmarks/vit-base-p16_8xb64-coslr-150e_in1k.py
- Name: vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 17581972224
Parameters: 86567656
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.9
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.pth
Config: configs/mocov3/benchmarks/vit-base-p16_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 61603111936
Parameters: 652781568
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth
Config: configs/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k.py
Downstream:
- vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k
- Name: vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 512
FLOPs: 61603111936
Parameters: 304326632
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.7
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.pth
Config: configs/mocov3/benchmarks/vit-large-p16_8xb64-coslr-100e_in1k.py
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
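# MoCo v3 scales its symmetrized InfoNCE loss by 2 * tau, hence the
# `loss_weight=2 * temperature` in the head below.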
model = dict(
type='MoCoV3',
base_momentum=0.01, # 0.01 for 100e and 300e, 0.004 for 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=9.6, weight_decay=1e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
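# A rough sketch of what the auto-scaling does (assuming MMEngine's default
# linear scaling rule): scaled_lr = lr * actual_batch_size / base_batch_size.
# This config assumes 8 GPUs x 512 images = 4096; launching with, say,
# 8 x 256 = 2048 and LR auto-scaling enabled would use 9.6 * 2048 / 4096 = 4.8.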
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
model = dict(
type='MoCoV3',
base_momentum=0.01, # 0.01 for 100e and 300e, 0.004 for 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=4.8, weight_decay=1e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
model = dict(
type='MoCoV3',
base_momentum=0.004, # 0.01 for 100e and 300e, 0.004 for 800 and 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=4.8, weight_decay=1.5e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=790,
by_epoch=True,
begin=10,
end=800,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=800)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
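# Note that the two views are deliberately asymmetric (view 1: blur always,
# no solarization; view 2: blur with prob 0.1, solarization with prob 0.2),
# following the BYOL-style augmentation recipe adopted by MoCo v3.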
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=256, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='base', # embed_dim = 768
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=768,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
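# `find_unused_parameters` is presumably needed because `stop_grad_conv1=True`
# detaches the patch-embedding gradients, so DDP must tolerate parameters that
# never receive a gradient.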
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=64, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='large', # embed_dim = 1024
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=1024,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
clip_grad=dict(max_norm=5.0, error_if_nonfinite=False),
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
randomness = dict(seed=0)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=256, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='mocov3-small', # embed_dim = 384
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=384,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
# MViT V2
> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf)
<!-- [ALGORITHM] -->
## Abstract
In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video
classification, as well as object detection. We present an improved version of MViT that incorporates
decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture
in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where
it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where
it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art
performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as
well as 86.1% on Kinetics-400 video classification.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/180376227-755243fa-158e-4068-940a-416036519665.png" width="50%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mvitv2-tiny_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mvitv2-tiny_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mvit/mvitv2-tiny_8xb256_in1k.py https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :----------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :-----------------------------------: | :----------------------------------------------------------------------------------: |
| `mvitv2-tiny_3rdparty_in1k`\* | From scratch | 24.17 | 4.70 | 82.33 | 96.15 | [config](mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) |
| `mvitv2-small_3rdparty_in1k`\* | From scratch | 34.87 | 7.00 | 83.63 | 96.51 | [config](mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) |
| `mvitv2-base_3rdparty_in1k`\* | From scratch | 51.47 | 10.16 | 84.34 | 96.86 | [config](mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) |
| `mvitv2-large_3rdparty_in1k`\* | From scratch | 217.99 | 43.87 | 85.25 | 97.14 | [config](mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/mvit). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{li2021improved,
title={MViTv2: Improved multiscale vision transformers for classification and detection},
author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
booktitle={CVPR},
year={2022}
}
```
Collections:
- Name: MViT V2
Metadata:
Architecture:
- Attention Dropout
- Convolution
- Dense Connections
- GELU
- Layer Normalization
- Scaled Dot-Product Attention
- Attention Pooling
Paper:
URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf
Title: 'MViTv2: Improved Multiscale Vision Transformers for Classification and Detection'
README: configs/mvit/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.24.0/mmcls/models/backbones/mvit.py
Version: v0.24.0
Models:
- Name: mvitv2-tiny_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 4703510768
Parameters: 24173320
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 82.33
Top 5 Accuracy: 96.15
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-tiny_8xb256_in1k.py
- Name: mvitv2-small_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 6997555136
Parameters: 34870216
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 83.63
Top 5 Accuracy: 96.51
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-small_8xb256_in1k.py
- Name: mvitv2-base_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 10157964400
Parameters: 51472744
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 84.34
Top 5 Accuracy: 96.86
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-base_8xb256_in1k.py
- Name: mvitv2-large_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 43868151412
Parameters: 217992952
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.25
Top 5 Accuracy: 97.14
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-large_8xb256_in1k.py
_base_ = [
'../_base_/models/mvit/mvitv2-base.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-large.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-small.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-tiny.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
# OFA
> [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework](https://arxiv.org/abs/2202.03052)
<!-- [ALGORITHM] -->
## Abstract
In this work, we pursue a unified paradigm for multimodal pretraining to break the scaffolds of complex task/modality-specific customization. We propose OFA, a Task-Agnostic and Modality-Agnostic framework that supports Task Comprehensiveness. OFA unifies a diverse set of cross-modal and unimodal tasks, including image generation, visual grounding, image captioning, image classification, language modeling, etc., in a simple sequence-to-sequence learning framework. OFA follows the instruction-based learning in both pretraining and finetuning stages, requiring no extra task-specific layers for downstream tasks. In comparison with the recent state-of-the-art vision & language models that rely on extremely large cross-modal datasets, OFA is pretrained on only 20M publicly available image-text pairs. Despite its simplicity and relatively small-scale training data, OFA achieves new SOTAs in a series of cross-modal tasks while attaining highly competitive performances on uni-modal tasks. Our further analysis indicates that OFA can also effectively transfer to unseen tasks and unseen domains.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/236164275-2429bf20-6e2a-4325-acc2-6117f9b53a53.png" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('ofa-base_3rdparty-finetuned_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'a dog and a kitten sitting next to each other'}
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/ofa/ofa-base_finetuned_refcoco.py https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (M) | BLEU-4 | CIDER | Config | Download |
| :-------------------------------------- | :--------: | :----: | :----: | :-------------------------------------: | :--------------------------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_caption`\* | 182.24 | 42.64 | 144.50 | [config](ofa-base_finetuned_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-caption_20230418-de18914e.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Grounding on RefCOCO
| Model | Params (M) | Accuracy (testA) | Accuracy (testB) | Config | Download |
| :-------------------------------------- | :--------: | :--------------: | :--------------: | :-------------------------------------: | :------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_refcoco`\* | 182.24 | 90.49 | 83.63 | [config](ofa-base_finetuned_refcoco.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (M) | Accuracy | Config | Download |
| :---------------------------------- | :--------: | :------: | :---------------------------------: | :--------------------------------------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_vqa`\* | 182.24 | 78.00 | [config](ofa-base_finetuned_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-vqa_20230418-f38539a5.pth) |
| `ofa-base_3rdparty-zeroshot_vqa`\* | 182.24 | 58.32 | [config](ofa-base_zeroshot_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_pretrain_20230418-dccfc07f.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{wang2022ofa,
author = {Peng Wang and
An Yang and
Rui Men and
Junyang Lin and
Shuai Bai and
Zhikang Li and
Jianxin Ma and
Chang Zhou and
Jingren Zhou and
Hongxia Yang},
title = {OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence
Learning Framework},
journal = {CoRR},
volume = {abs/2202.03052},
year = {2022}
}
```
Collections:
- Name: OFA
Metadata:
Architecture:
- ResNet
- Transformer
Training Data:
- CC12M
- CC3M
- SBU
- COCO
- VG
- VQAv2
- GQA
- RefCOCO
- OpenImages
- Object365
- YFCC100M
- ImageNet-21K
- Pile
Paper:
Title: 'OFA: Unifying Architectures, Tasks, and Modalities Through a Simple
Sequence-to-Sequence Learning Framework'
URL: https://arxiv.org/abs/2202.03052
README: configs/ofa/README.md
Models:
- Name: ofa-base_3rdparty-finetuned_refcoco
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Grounding
Dataset: RefCOCO
Metrics:
Accuracy (testA): 90.49
Accuracy (testB): 83.63
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth
Config: configs/ofa/ofa-base_finetuned_refcoco.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/refcoco_base_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-finetuned_vqa
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 78.00 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-vqa_20230418-f38539a5.pth
Config: configs/ofa/ofa-base_finetuned_vqa.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/vqa_large_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-finetuned_caption
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
BLEU-4: 42.64
CIDER: 144.50
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-caption_20230418-de18914e.pth
Config: configs/ofa/ofa-base_finetuned_caption.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_base_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-zeroshot_vqa
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 58.32
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_pretrain_20230418-dccfc07f.pth
Config: configs/ofa/ofa-base_zeroshot_vqa.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/ofa_base.pt
Code: https://github.com/OFA-Sys/OFA