Commit 495d9ed9 authored by limm's avatar limm
Browse files

add part code

parent 59b09903
Pipeline #2799 canceled with stages
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
out_type='avg_featmap',
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=2e-3, weight_decay=0.05, betas=(0.9, 0.999)),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.65,
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
train_cfg = dict(by_epoch=True, max_epochs=100)
randomness = dict(seed=0, diff_rank_seed=True)
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ToPIL', to_rgb=True),
dict(type='MAERandomResizedCrop', size=224, interpolation=3),
dict(type='torchvision/RandomHorizontalFlip', p=0.5),
dict(type='ToNumpy', to_bgr=True),
dict(type='PackInputs'),
]
# dataset settings
train_dataloader = dict(
batch_size=2048, drop_last=True, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
out_type='cls_token',
final_norm=True,
init_cfg=dict(type='Pretrained', prefix='backbone.')),
neck=dict(type='ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
by_epoch=True,
begin=10,
end=90,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=90)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)
Collections:
- Name: MFF
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 8x A100-80G GPUs
Architecture:
- ViT
Paper:
Title: Improving Pixel-based MIM by Reducing Wasted Modeling Capability
URL: https://arxiv.org/pdf/2308.00261.pdf
README: configs/mff/README.md
Models:
- Name: mff_vit-base-p16_8xb512-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 2048
FLOPs: 17581972224
Parameters: 85882692
Training Data: ImageNet-1k
In Collection: MaskFeat
Results: null
Weights: https://download.openmmlab.com/mmpretrain/v1.0/mff/mff_vit-base-p16_8xb512-amp-coslr-300e_in1k/mff_vit-base-p16_8xb512-amp-coslr-300e_in1k_20230801-3c1bcce4.pth
Config: configs/mff/mff_vit-base-p16_8xb512-amp-coslr-300e_in1k.py
Downstream:
- vit-base-p16_mff-300e-pre_8xb128-coslr-100e_in1k
- vit-base-p16_mff-300e-pre_8xb2048-linear-coslr-90e_in1k
- Name: mff_vit-base-p16_8xb512-amp-coslr-800e_in1k
Metadata:
Epochs: 800
Batch Size: 2048
FLOPs: 17581972224
Parameters: 85882692
Training Data: ImageNet-1k
In Collection: MaskFeat
Results: null
Weights: https://download.openmmlab.com/mmpretrain/v1.0/mff/mff_vit-base-p16_8xb512-amp-coslr-800e_in1k/mff_vit-base-p16_8xb512-amp-coslr-800e_in1k_20230801-3af7cd9d.pth
Config: configs/mff/mff_vit-base-p16_8xb512-amp-coslr-800e_in1k.py
Downstream:
- vit-base-p16_mff-800e-pre_8xb128-coslr-100e_in1k
- vit-base-p16_mff-800e-pre_8xb2048-linear-coslr-90e_in1k
- Name: vit-base-p16_mff-300e-pre_8xb128-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 1024
FLOPs: 17581215744
Parameters: 86566120
Training Data: ImageNet-1k
In Collection: MaskFeat
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.0
Weights: https://download.openmmlab.com/mmpretrain/v1.0/mff/mff_vit-base-p16_8xb512-amp-coslr-300e_in1k/vit-base-p16_8xb128-coslr-100e_in1k/vit-base-p16_8xb128-coslr-100e_in1k_20230802-d746fdb7.pth
Config: configs/mff/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
- Name: vit-base-p16_mff-800e-pre_8xb128-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 1024
FLOPs: 17581215744
Parameters: 86566120
Training Data: ImageNet-1k
In Collection: MFF
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.7
Weights: https://download.openmmlab.com/mmpretrain/v1.0/mff/mff_vit-base-p16_8xb512-amp-coslr-800e_in1k/vit-base-p16_8xb128-coslr-100e/vit-base-p16_8xb128-coslr-100e_20230802-6780e47d.pth
Config: configs/mff/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
- Name: vit-base-p16_mff-300e-pre_8xb2048-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 16384
FLOPs: 17581215744
Parameters: 86566120
Training Data: ImageNet-1k
In Collection: MFF
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 64.2
Weights:
Config: configs/mff/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-base-p16_mff-800e-pre_8xb2048-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 16384
FLOPs: 17581215744
Parameters: 86566120
Training Data: ImageNet-1k
In Collection: MFF
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 68.3
Weights: https://download.openmmlab.com/mmpretrain/v1.0/mff/mff_vit-base-p16_8xb512-amp-coslr-300e_in1k/vit-base-p16_8xb128-coslr-100e_in1k/vit-base-p16_8xb128-coslr-100e_in1k_20230802-d746fdb7.pth
Config: configs/mff/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
_base_ = '../mae/mae_vit-base-p16_8xb512-amp-coslr-300e_in1k.py'
randomness = dict(seed=2, diff_rank_seed=True)
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ToPIL', to_rgb=True),
dict(type='torchvision/Resize', size=224),
dict(
type='torchvision/RandomCrop',
size=224,
padding=4,
padding_mode='reflect'),
dict(type='torchvision/RandomHorizontalFlip', p=0.5),
dict(type='ToNumpy', to_bgr=True),
dict(type='PackInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
# model config
model = dict(
type='MFF', backbone=dict(type='MFFViT', out_indices=[0, 2, 4, 6, 8, 11]))
_base_ = '../mae/mae_vit-base-p16_8xb512-amp-coslr-800e_in1k.py'
randomness = dict(seed=2, diff_rank_seed=True)
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ToPIL', to_rgb=True),
dict(type='torchvision/Resize', size=224),
dict(
type='torchvision/RandomCrop',
size=224,
padding=4,
padding_mode='reflect'),
dict(type='torchvision/RandomHorizontalFlip', p=0.5),
dict(type='ToNumpy', to_bgr=True),
dict(type='PackInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
# model config
model = dict(
type='MFF', backbone=dict(type='MFFViT', out_indices=[0, 2, 4, 6, 8, 11]))
# MILAN
> [MILAN: Masked Image Pretraining on Language Assisted Representation](https://arxiv.org/pdf/2208.06049)
<!-- [ALGORITHM] -->
## Abstract
Self-attention based transformer models have been dominating many computer
vision tasks in the past few years. Their superb model qualities heavily depend
on the excessively large labeled image datasets. In order to reduce the reliance
on large labeled datasets, reconstruction based masked autoencoders are gaining
popularity, which learn high quality transferable representations from unlabeled
images. For the same purpose, recent weakly supervised image pretraining methods
explore language supervision from text captions accompanying the images. In this
work, we propose masked image pretraining on language assisted representation,
dubbed as MILAN. Instead of predicting raw pixels or low level features, our
pretraining objective is to reconstruct the image features with substantial semantic
signals that are obtained using caption supervision. Moreover, to accommodate our
reconstruction target, we propose a more efficient prompting decoder architecture
and a semantic aware mask sampling mechanism, which further advance the
transfer performance of the pretrained model. Experimental results demonstrate
that MILAN delivers higher accuracy than the previous works. When the masked
autoencoder is pretrained and finetuned on ImageNet-1K dataset with an input
resolution of 224×224, MILAN achieves a top-1 accuracy of 85.4% on ViTB/16, surpassing previous state-of-the-arts by 1%. In the downstream semantic
segmentation task, MILAN achieves 52.7 mIoU using ViT-B/16 backbone on
ADE20K dataset, outperforming previous masked pretraining results by 4 points.
<div align=center>
<img src="https://user-images.githubusercontent.com/30762564/205210369-41a65c4c-bcd4-4147-91ea-c6c9061ab455.png" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('vit-base-p16_milan-pre_8xb128-coslr-100e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('milan_vit-base-p16_16xb256-amp-coslr-400e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k.py
```
Test:
```shell
python tools/test.py configs/milan/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :----------------------------------------------- | :--------: | :-------: | :---------------------------------------------------------: | :------------------------------------------------------------------------: |
| `milan_vit-base-p16_16xb256-amp-coslr-400e_in1k` | 111.91 | 17.58 | [config](milan_vit-base-p16_16xb256-amp-coslr-400e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `vit-base-p16_milan-pre_8xb128-coslr-100e_in1k` | [MILAN](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth) | 86.57 | 17.58 | 85.30 | [config](benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.json) |
| `vit-base-p16_milan-pre_8xb2048-linear-coslr-100e_in1k` | [MILAN](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth) | 86.57 | 17.58 | 78.90 | [config](benchmarks/vit-base-p16_8xb2048-linear-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221129-03f26f85.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221129-03f26f85.json) |
## Citation
```bibtex
@article{Hou2022MILANMI,
title={MILAN: Masked Image Pretraining on Language Assisted Representation},
author={Zejiang Hou and Fei Sun and Yen-Kuang Chen and Yuan Xie and S. Y. Kung},
journal={ArXiv},
year={2022}
}
```
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
out_type='avg_featmap',
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=4e-4, weight_decay=0.05, betas=(0.9, 0.999)),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.65,
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
out_type='cls_token',
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=dict(type='ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]),
data_preprocessor=dict(
num_classes=1000,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=3.2, weight_decay=0.0, momentum=0.9),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)
Collections:
- Name: MILAN
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 16x A100-80G GPUs
Architecture:
- ViT
Paper:
Title: 'MILAN: Masked Image Pretraining on Language Assisted Representation'
URL: https://arxiv.org/pdf/2208.06049
README: configs/milan/README.md
Models:
- Name: milan_vit-base-p16_16xb256-amp-coslr-400e_in1k
Metadata:
Epochs: 400
Batch Size: 4096
FLOPs: 17581972224
Parameters: 111907584
Training Data: ImageNet-1k
In Collection: MILAN
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth
Config: configs/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k.py
Downstream:
- vit-base-p16_milan-pre_8xb128-coslr-100e_in1k
- vit-base-p16_milan-pre_8xb2048-linear-coslr-100e_in1k
- Name: vit-base-p16_milan-pre_8xb128-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 1024
FLOPs: 17581215744
Parameters: 86566120
Training Data: ImageNet-1k
In Collection: MILAN
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.3
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.pth
Config: configs/milan/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
- Name: vit-base-p16_milan-pre_8xb2048-linear-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 16384
FLOPs: 17581972992
Parameters: 86567656
Training Data: ImageNet-1k
In Collection: MILAN
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.9
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221129-03f26f85.pth
Config: configs/milan/benchmarks/vit-base-p16_8xb2048-linear-coslr-100e_in1k.py
_base_ = [
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='MILAN',
backbone=dict(
type='MILANViT',
arch='b',
patch_size=16,
mask_ratio=0.75,
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='MILANPretrainDecoder',
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
head=dict(
type='MIMHead',
loss=dict(
type='CosineSimilarityLoss', shift_factor=2.0, scale_factor=2.0),
),
target_generator=dict(
type='CLIPGenerator',
tokenizer_path= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/clip_vit_base_16.pth.tar' # noqa
),
init_cfg=None)
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
# MiniGPT4
> [MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models](https://arxiv.org/abs/2304.10592)
<!-- [ALGORITHM] -->
## Abstract
The recent GPT-4 has demonstrated extraordinary multi-modal abilities, such as directly generating websites from handwritten text and identifying humorous elements within images. These features are rarely observed in previous vision-language models. We believe the primary reason for GPT-4's advanced multi-modal generation capabilities lies in the utilization of a more advanced large language model (LLM). To examine this phenomenon, we present MiniGPT-4, which aligns a frozen visual encoder with a frozen LLM, Vicuna, using just one projection layer. Our findings reveal that MiniGPT-4 possesses many capabilities similar to those exhibited by GPT-4 like detailed image description generation and website creation from hand-written drafts. Furthermore, we also observe other emerging capabilities in MiniGPT-4, including writing stories and poems inspired by given images, providing solutions to problems shown in images, teaching users how to cook based on food photos, etc. In our experiment, we found that only performing the pretraining on raw image-text pairs could produce unnatural language outputs that lack coherency including repetition and fragmented sentences. To address this problem, we curate a high-quality, well-aligned dataset in the second stage to finetune our model using a conversational template. This step proved crucial for augmenting the model's generation reliability and overall usability. Notably, our model is highly computationally efficient, as we only train a projection layer utilizing approximately 5 million aligned image-text pairs. Our code, pre-trained model, and collected dataset are available at https://minigpt-4.github.io/.
<div align=center>
<img src="https://github.com/open-mmlab/mmpretrain/assets/36138628/1d8f328d-6c91-493e-8992-29e84a0fc3c8" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('minigpt-4_vicuna-7b_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'This image shows a small dog and a kitten sitting on a blanket in a field of flowers. The dog is looking up at the kitten with a playful expression on its face. The background is a colorful striped blanket, and there are flowers all around them. The image is well composed with the two animals sitting in the center of the frame, surrounded by the flowers and blanket.'}
```
<!-- [TABS-END] -->
## Models and results
For Vicuna model, please refer to [MiniGPT-4 page](https://github.com/Vision-CAIR/MiniGPT-4) for preparation guidelines.
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :------------------------------ | :--------: | :-------: | :----------------------------------------: | :----------------------------------------------------------------------------------------------------------: |
| `minigpt-4_baichuan-7b_caption` | 8094.77 | N/A | [config](minigpt-4_baichuan-7b_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/minigpt4/minigpt-4_linear_baichuan7b_20231011-5dca7ed6.pth) |
| `minigpt-4_vicuna-7b_caption`\* | 8121.32 | N/A | [config](minigpt-4_vicuna-7b_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/minigpt4/minigpt-4_linear_vicuna7b_20230615-714b5f52.pth) |
*Models with * are converted from the [official repo](https://github.com/Vision-CAIR/MiniGPT-4/tree/main). The config files of these models are only for inference. We haven't reproduce the training results.*
## Citation
```bibtex
@article{zhu2023minigpt,
title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
journal={arXiv preprint arXiv:2304.10592},
year={2023}
}
```
Collections:
- Name: MiniGPT4
Metadata:
Architecture:
- Transformer
- Gated Cross-Attention Dense
Paper:
Title: 'MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models'
URL: https://arxiv.org/abs/2304.10592
README: configs/minigpt4/README.md
Models:
- Name: minigpt-4_vicuna-7b_caption
Metadata:
FLOPs: null
Parameters: 8121315072
In Collection: MiniGPT4
Results:
- Task: Image Caption
Dataset: COCO
Metrics: null
Weights: https://download.openmmlab.com/mmclassification/v1/minigpt4/minigpt-4_linear_vicuna7b_20230615-714b5f52.pth
Config: configs/minigpt4/minigpt-4_vicuna-7b_caption.py
Converted From:
Weights: https://github.com/Vision-CAIR/MiniGPT-4/tree/main
Code: https://github.com/Vision-CAIR/MiniGPT-4/tree/main
- Name: minigpt-4_baichuan-7b_caption
Metadata:
FLOPs: null
Parameters: 8094769024
In Collection: MiniGPT4
Results:
- Task: Image Caption
Dataset: COCO
Metrics: null
Weights: https://download.openmmlab.com/mmclassification/v1/minigpt4/minigpt-4_linear_baichuan7b_20231011-5dca7ed6.pth
Config: configs/minigpt4/minigpt-4_baichuan-7b_caption.py
_base_ = [
'../_base_/default_runtime.py',
]
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(224, 224),
interpolation='bicubic',
backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='CleanCaption',
keys='chat_content',
remove_chars='',
lowercase=False),
dict(
type='PackInputs',
algorithm_keys=['chat_content', 'lang'],
meta_keys=['image_id']),
]
train_dataloader = dict(
batch_size=2,
num_workers=4,
dataset=dict(
type='MiniGPT4Dataset',
data_root='YOUR_DATA_DIRECTORY',
ann_file='YOUR_DATA_FILE',
pipeline=train_pipeline),
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
drop_last=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(224, 224),
interpolation='bicubic',
backend='pillow'),
dict(type='PackInputs', meta_keys=['image_id']),
]
test_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/coco_karpathy_val_gt.json',
)
test_dataloader = dict(
batch_size=1,
dataset=dict(
type='COCOCaption',
data_root='data/coco',
ann_file='annotations/coco_karpathy_val.json',
pipeline=test_pipeline))
# model settings
model = dict(
type='MiniGPT4',
vision_encoder=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=224,
patch_size=14,
layer_scale_init_value=0.0,
frozen_stages=39,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw',
pretrained= # noqa
'https://download.openmmlab.com/mmpretrain/v1.0/minigpt4/minigpt-4_eva-g-p14_20230615-e908c021.pth' # noqa
),
q_former_model=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32,
pretrained= # noqa
'https://download.openmmlab.com/mmpretrain/v1.0/minigpt4/minigpt-4_qformer_20230615-1dfa889c.pth' # noqa
),
lang_encoder=dict(
type='AutoModelForCausalLM',
name_or_path='baichuan-inc/baichuan-7B',
trust_remote_code=True),
tokenizer=dict(
type='AutoTokenizer',
name_or_path='baichuan-inc/baichuan-7B',
trust_remote_code=True),
task='caption',
prompt_template=dict([('en', '###Ask: {} ###Answer: '),
('zh', '###问:{} ###答:')]),
raw_prompts=dict([
('en', [('<Img><ImageHere></Img> '
'Describe this image in detail.'),
('<Img><ImageHere></Img> '
'Take a look at this image and describe what you notice.'),
('<Img><ImageHere></Img> '
'Please provide a detailed description of the picture.'),
('<Img><ImageHere></Img> '
'Could you describe the contents of this image for me?')]),
('zh', [('<Img><ImageHere></Img> '
'详细描述这张图片。'), ('<Img><ImageHere></Img> '
'浏览这张图片并描述你注意到什么。'),
('<Img><ImageHere></Img> '
'请对这张图片进行详细的描述。'),
('<Img><ImageHere></Img> '
'你能为我描述这张图片的内容吗?')])
]),
max_txt_len=160,
end_sym='###')
strategy = dict(
type='DeepSpeedStrategy',
fp16=dict(
enabled=True,
auto_cast=False,
fp16_master_weights_and_grads=False,
loss_scale=0,
loss_scale_window=1000,
hysteresis=1,
min_loss_scale=1,
initial_scale_power=16,
),
inputs_to_half=[0],
zero_optimization=dict(
stage=2,
allgather_partitions=True,
allgather_bucket_size=2e8,
reduce_scatter=True,
reduce_bucket_size='auto',
overlap_comm=True,
contiguous_gradients=True,
),
)
# schedule settings
optim_wrapper = dict(
type='DeepSpeedOptimWrapper',
optimizer=dict(type='AdamW', lr=1e-3, weight_decay=0.05))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-3 / 500,
by_epoch=False,
begin=0,
end=500,
),
dict(
type='CosineAnnealingLR',
eta_min=2e-4,
by_epoch=False,
begin=500,
),
]
train_cfg = dict(by_epoch=True, max_epochs=6)
test_cfg = dict()
runner_type = 'FlexibleRunner'
default_hooks = dict(
checkpoint=dict(
type='CheckpointHook',
interval=1,
by_epoch=True,
save_last=True,
max_keep_ckpts=1,
))
_base_ = [
'../_base_/datasets/coco_caption.py',
'../_base_/default_runtime.py',
]
# dataset settings
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(224, 224),
interpolation='bicubic',
backend='pillow'),
dict(type='PackInputs', meta_keys=['image_id']),
]
val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='MiniGPT4',
vision_encoder=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=224,
patch_size=14,
layer_scale_init_value=0.0,
frozen_stages=39,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw',
pretrained= # noqa
'https://download.openmmlab.com/mmpretrain/v1.0/minigpt4/minigpt-4_eva-g-p14_20230615-e908c021.pth' # noqa
),
q_former_model=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32,
pretrained= # noqa
'https://download.openmmlab.com/mmpretrain/v1.0/minigpt4/minigpt-4_qformer_20230615-1dfa889c.pth' # noqa
),
lang_encoder=dict(
type='AutoModelForCausalLM', name_or_path='YOUR_PATH_TO_VICUNA'),
tokenizer=dict(type='LlamaTokenizer', name_or_path='YOUR_PATH_TO_VICUNA'),
task='caption',
prompt_template=dict([('en', '###Ask: {} ###Answer: '),
('zh', '###问:{} ###答:')]),
raw_prompts=dict([
('en', [('<Img><ImageHere></Img> '
'Describe this image in detail.'),
('<Img><ImageHere></Img> '
'Take a look at this image and describe what you notice.'),
('<Img><ImageHere></Img> '
'Please provide a detailed description of the picture.'),
('<Img><ImageHere></Img> '
'Could you describe the contents of this image for me?')]),
('zh', [('<Img><ImageHere></Img> '
'详细描述这张图片。'), ('<Img><ImageHere></Img> '
'浏览这张图片并描述你注意到什么。'),
('<Img><ImageHere></Img> '
'请对这张图片进行详细的描述。'),
('<Img><ImageHere></Img> '
'你能为我描述这张图片的内容吗?')])
]),
max_txt_len=160,
end_sym='###')
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=5,
)
]
train_cfg = dict(by_epoch=True, max_epochs=5)
val_cfg = dict()
test_cfg = dict()
# MixMIM
> [MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning](https://arxiv.org/abs/2205.13137)
<!-- [ALGORITHM] -->
## Abstract
In this study, we propose Mixed and Masked Image Modeling (MixMIM), a
simple but efficient MIM method that is applicable to various hierarchical Vision
Transformers. Existing MIM methods replace a random subset of input tokens with
a special [MASK] symbol and aim at reconstructing original image tokens from
the corrupted image. However, we find that using the [MASK] symbol greatly
slows down the training and causes training-finetuning inconsistency, due to the
large masking ratio (e.g., 40% in BEiT). In contrast, we replace the masked tokens
of one image with visible tokens of another image, i.e., creating a mixed image.
We then conduct dual reconstruction to reconstruct the original two images from
the mixed input, which significantly improves efficiency. While MixMIM can
be applied to various architectures, this paper explores a simpler but stronger
hierarchical Transformer, and scales with MixMIM-B, -L, and -H. Empirical
results demonstrate that MixMIM can learn high-quality visual representations
efficiently. Notably, MixMIM-B with 88M parameters achieves 85.1% top-1
accuracy on ImageNet-1K by pretraining for 600 epochs, setting a new record for
neural networks with comparable model sizes (e.g., ViT-B) among MIM methods.
Besides, its transferring performances on the other 6 datasets show MixMIM has
better FLOPs / performance tradeoff than previous MIM methods
<div align=center>
<img src="https://user-images.githubusercontent.com/56866854/202853730-d26fb3d7-e5e8-487a-aad5-e3d4600cef87.png"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mixmim_mixmim-base_16xb128-coslr-300e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mixmim/mixmim_mixmim-base_16xb128-coslr-300e_in1k.py
```
Test:
```shell
python tools/test.py configs/mixmim/benchmarks/mixmim-base_8xb128-coslr-100e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :------------------------------------------- | :--------: | :-------: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------: |
| `mixmim_mixmim-base_16xb128-coslr-300e_in1k` | 114.67 | 16.35 | [config](mixmim_mixmim-base_16xb128-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k` | [MIXMIM](https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.pth) | 88.34 | 16.35 | 84.63 | [config](benchmarks/mixmim-base_8xb128-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.json) |
## Citation
```bibtex
@article{MixMIM2022,
author = {Jihao Liu, Xin Huang, Yu Liu, Hongsheng Li},
journal = {arXiv:2205.13137},
title = {MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning},
year = {2022},
}
```
_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
)
bgr_mean = data_preprocessor['mean'][::-1]
bgr_std = data_preprocessor['std'][::-1]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(
pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=bgr_mean,
fill_std=bgr_std),
dict(type='PackInputs'),
]
train_dataloader = dict(
batch_size=128,
num_workers=16,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix='train',
pipeline=train_pipeline),
sampler=dict(type='DefaultSampler', shuffle=True),
persistent_workers=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs'),
]
val_dataloader = dict(
batch_size=64,
num_workers=8,
pin_memory=True,
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/val.txt',
data_prefix='val',
pipeline=test_pipeline),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_dataloader = val_dataloader
model = dict(
backbone=dict(
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=5e-4 * (8 * 128 / 256),
betas=(0.9, 0.999),
weight_decay=0.05),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.7,
custom_keys={
'.ln': dict(decay_mult=0.0), # do not decay on ln and bias
'.bias': dict(decay_mult=0.0)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
eta_min=1e-6,
by_epoch=True,
begin=5,
end=100,
convert_to_iter_based=True)
]
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=10)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1))
_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs256.py',
'../../_base_/default_runtime.py'
]
Collections:
- Name: MixMIM
Metadata:
Architecture:
- Attention Dropout
- Convolution
- Dense Connections
- Dropout
- GELU
- Layer Normalization
- Multi-Head Attention
- Scaled Dot-Product Attention
- Tanh Activation
Paper:
Title: 'MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation
Learning'
URL: https://arxiv.org/abs/2205.13137
README: configs/mixmim/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/mixmim.py
Version: v1.0.0rc4
Models:
- Name: mixmim_mixmim-base_16xb128-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 2048
FLOPs: 16351906816
Parameters: 114665784
Training Data: ImageNet-1k
In Collection: MixMIM
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.pth
Config: configs/mixmim/mixmim_mixmim-base_16xb128-coslr-300e_in1k.py
Downstream:
- mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
- Name: mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 1024
FLOPs: 16351906816
Parameters: 88344352
Training Data: ImageNet-1k
In Collection: MixMIM
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.63
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.pth
Config: configs/mixmim/benchmarks/mixmim-base_8xb128-coslr-100e_in1k.py
_base_ = '../_base_/default_runtime.py'
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.2, 1.0),
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5),
dict(type='PackInputs')
]
train_dataloader = dict(
batch_size=128,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# model settings
model = dict(
type='MixMIM',
backbone=dict(
type='MixMIMPretrainTransformer',
arch='B',
drop_rate=0.0,
drop_path_rate=0.0, # drop_path_rate=0.0 during pretraining
mask_ratio=0.5),
neck=dict(
type='MixMIMPretrainDecoder',
num_patches=49,
encoder_stride=32,
embed_dim=1024,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16),
head=dict(
type='MixMIMPretrainHead',
norm_pix=True,
loss=dict(type='PixelReconstructionLoss', criterion='L2')))
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * (2048 / 256),
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=1))
randomness = dict(seed=0, diff_rank_seed=True)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
# MLP-Mixer
> [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601)
<!-- [ALGORITHM] -->
## Abstract
Convolutional Neural Networks (CNNs) are the go-to model for computer vision. Recently, attention-based networks, such as the Vision Transformer, have also become popular. In this paper we show that while convolutions and attention are both sufficient for good performance, neither of them are necessary. We present MLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs). MLP-Mixer contains two types of layers: one with MLPs applied independently to image patches (i.e. "mixing" the per-location features), and one with MLPs applied across patches (i.e. "mixing" spatial information). When trained on large datasets, or with modern regularization schemes, MLP-Mixer attains competitive scores on image classification benchmarks, with pre-training and inference cost comparable to state-of-the-art models. We hope that these results spark further research beyond the realms of well established CNNs and Transformers.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/143178327-7118b48a-5f5f-4844-a614-a571917384ca.png" width="90%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mlp-mixer-base-p16_3rdparty_64xb64_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mlp-mixer-base-p16_3rdparty_64xb64_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------------: | :-------------------------------------------------------------: |
| `mlp-mixer-base-p16_3rdparty_64xb64_in1k`\* | From scratch | 59.88 | 12.61 | 76.68 | 92.25 | [config](mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) |
| `mlp-mixer-large-p16_3rdparty_64xb64_in1k`\* | From scratch | 208.20 | 44.57 | 72.34 | 88.02 | [config](mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) |
*Models with * are converted from the [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py). The config files of these models are only for inference. We haven't reproduce the training results.*
## Citation
```bibtex
@misc{tolstikhin2021mlpmixer,
title={MLP-Mixer: An all-MLP Architecture for Vision},
author={Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey Dosovitskiy},
year={2021},
eprint={2105.01601},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment