_base_ = [
'efficientnetv2-s_8xb32_in1k-384px.py',
]
# model setting
model = dict(backbone=dict(arch='l'), )
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
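The per-arch configs in this directory only override the backbone `arch` and the crop sizes; everything else is inherited through `_base_`. As a minimal sketch of how that inheritance resolves with mmengine (assuming the config above is saved under the path the metafile below lists, `configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py`):

```python
from mmengine.config import Config

# Loading the child config merges it on top of every file in `_base_`:
# scalar keys are overridden, nested dicts are merged key by key.
cfg = Config.fromfile(
    'configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py')

print(cfg.model.backbone.arch)                  # 'l', overridden here
print(cfg.test_dataloader.dataset.pipeline[1])  # the 480px EfficientNetCenterCrop
```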
_base_ = ['./efficientnetv2-s_8xb32_in21k.py']
# model setting
model = dict(backbone=dict(arch='l'), )
_base_ = [
'efficientnetv2-s_8xb32_in1k-384px.py',
]
# model setting
model = dict(backbone=dict(arch='m'), )
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
_base_ = ['./efficientnetv2-s_8xb32_in21k.py']
# model setting
model = dict(backbone=dict(arch='m'), )
_base_ = [
'../_base_/models/efficientnet_v2/efficientnetv2_s.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/schedules/imagenet_bs256.py',
'../_base_/default_runtime.py',
]
# dataset settings
dataset_type = 'ImageNet'
data_preprocessor = dict(
num_classes=1000,
# RGB format normalization parameters
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
# convert image from BGR to RGB
to_rgb=True,
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetRandomCrop', scale=300, crop_padding=0),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetCenterCrop', crop_size=384, crop_padding=0),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
_base_ = [
'../_base_/models/efficientnet_v2/efficientnetv2_s.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/schedules/imagenet_bs256.py',
'../_base_/default_runtime.py',
]
# model setting
model = dict(head=dict(num_classes=21843))
# dataset settings
dataset_type = 'ImageNet21k'
data_preprocessor = dict(
num_classes=21843,
# RGB format normalization parameters
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
# convert image from BGR to RGB
to_rgb=True,
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetRandomCrop', scale=224),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetCenterCrop', crop_size=224, crop_padding=0),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
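The `clip_grad=dict(max_norm=5.0)` entry above tells the optimizer wrapper to clip gradients by their global norm before every step. A rough plain-PyTorch equivalent of that clipping, shown only to illustrate the setting (the optimizer wrapper handles this internally, so the snippet is illustrative):

```python
import torch

def clipped_step(model: torch.nn.Module, optimizer: torch.optim.Optimizer) -> None:
    # Assumes loss.backward() has already populated the .grad buffers.
    # Rescales all gradients so their global L2 norm is at most 5.0,
    # matching max_norm=5.0 in the optim_wrapper above.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
    optimizer.step()
    optimizer.zero_grad()
```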
_base_ = [
'efficientnetv2-s_8xb32_in1k-384px.py',
]
# model setting
model = dict(backbone=dict(arch='xl'), )
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='EfficientNetCenterCrop', crop_size=512, crop_padding=0),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
_base_ = ['./efficientnetv2-s_8xb32_in21k.py']
# model setting
model = dict(backbone=dict(arch='xl'), )
Collections:
- Name: EfficientNetV2
Metadata:
Training Data: ImageNet-1k
Architecture:
- 1x1 Convolution
- Average Pooling
- Convolution
- Dense Connections
- Dropout
- Inverted Residual Block
- RMSProp
- Squeeze-and-Excitation Block
- Swish
Paper:
URL: https://arxiv.org/abs/2104.00298
Title: "EfficientNetV2: Smaller Models and Faster Training"
README: configs/efficientnet_v2/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/efficientnet_v2.py
Version: v1.0.0rc4
Models:
- Name: efficientnetv2-b0_3rdparty_in1k
Metadata:
FLOPs: 919843360
Parameters: 7139704
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.52
Top 5 Accuracy: 94.44
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b0_3rdparty_in1k_20221221-9ef6e736.pth
Config: configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-b1_3rdparty_in1k
Metadata:
FLOPs: 1438287552
Parameters: 8141052
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.80
Top 5 Accuracy: 94.89
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b1_3rdparty_in1k_20221221-6955d9ce.pth
Config: configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b1-be6e41b0.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-b2_3rdparty_in1k
Metadata:
FLOPs: 1986433080
Parameters: 10096086
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.63
Top 5 Accuracy: 95.30
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b2_3rdparty_in1k_20221221-74f7d493.pth
Config: configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b2-847de54e.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-b3_3rdparty_in1k
Metadata:
FLOPs: 3498068400
Parameters: 14358406
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.03
Top 5 Accuracy: 95.88
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b3_3rdparty_in1k_20221221-b6f07a36.pth
Config: configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b3-57773f13.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-s_3rdparty_in1k
Metadata:
FLOPs: 9719420928
Parameters: 21458488
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.82
Top 5 Accuracy: 96.67
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in1k_20221220-f0eaff9d.pth
Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s-eb54923e.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-m_3rdparty_in1k
Metadata:
FLOPs: 26880363584
Parameters: 54139356
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.01
Top 5 Accuracy: 97.26
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in1k_20221220-9dc0c729.pth
Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-l_3rdparty_in1k
Metadata:
FLOPs: 60142387008
Parameters: 118515272
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.43
Top 5 Accuracy: 97.31
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in1k_20221220-5c3bac0f.pth
Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-s_in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 9719420928
Parameters: 21458488
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.29
Top 5 Accuracy: 97.26
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_in21k-pre-3rdparty_in1k_20221220-7a7c8475.pth
Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-m_in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 26880363584
Parameters: 54139356
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.47
Top 5 Accuracy: 97.76
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_in21k-pre-3rdparty_in1k_20221220-a1013a04.pth
Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-l_in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 60142387008
Parameters: 118515272
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.31
Top 5 Accuracy: 97.99
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_in21k-pre-3rdparty_in1k_20221220-63df0efd.pth
Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-xl_in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 98341230592
Parameters: 208119808
In Collection: EfficientNetV2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.39
Top 5 Accuracy: 97.83
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_in21k-pre-3rdparty_in1k_20221220-583ac18b.pth
Config: configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-s_3rdparty_in21k
Metadata:
FLOPs: 3309720768
Parameters: 48158371
In Collection: EfficientNetV2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in21k_20221220-c0572b56.pth
Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-m_3rdparty_in21k
Metadata:
FLOPs: 5861638208
Parameters: 80839239
In Collection: EfficientNetV2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in21k_20221220-073e944c.pth
Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-l_3rdparty_in21k
Metadata:
FLOPs: 13114950464
Parameters: 145215155
In Collection: EfficientNetV2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in21k_20221220-f28f91e1.pth
Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
- Name: efficientnetv2-xl_3rdparty_in21k
Metadata:
FLOPs: 18855244288
Parameters: 234819691
In Collection: EfficientNetV2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_3rdparty_in21k_20221220-b2c9329c.pth
Config: configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
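Every `Name` entry in the metafile above is a model identifier that mmpretrain can resolve directly. A small sketch of pulling one of the listed third-party checkpoints, assuming mmpretrain is installed and the weights URL is reachable:

```python
from mmpretrain import get_model, inference_model

# 'efficientnetv2-b0_3rdparty_in1k' is one of the Names listed above;
# get_model fetches the corresponding Weights entry automatically.
model = get_model('efficientnetv2-b0_3rdparty_in1k', pretrained=True)

# Or run a single-image prediction, mirroring the usage examples in the
# EVA README that follows.
result = inference_model('efficientnetv2-b0_3rdparty_in1k', 'demo/bird.JPEG')
print(result['pred_class'], result['pred_score'])
```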
# EVA
> [EVA: Exploring the Limits of Masked Visual Representation Learning at Scale](https://arxiv.org/abs/2211.07636)
<!-- [ALGORITHM] -->
## Abstract
We launch EVA, a vision-centric foundation model to explore the limits of visual representation at scale using only publicly accessible data. EVA is a vanilla ViT pre-trained to reconstruct the masked-out image-text aligned vision features conditioned on visible image patches. Via this pretext task, we can efficiently scale up EVA to one billion parameters and set new records on a broad range of representative vision downstream tasks, such as image recognition, video action recognition, object detection, instance segmentation and semantic segmentation, without heavy supervised training. Moreover, we observe that quantitative changes in scaling EVA result in qualitative changes in transfer learning performance that are not present in other models. For instance, EVA takes a great leap in the challenging large-vocabulary instance segmentation task: our model achieves almost the same state-of-the-art performance on the LVISv1.0 dataset, with over a thousand categories, as on the COCO dataset, with only eighty categories. Beyond a pure vision encoder, EVA can also serve as a vision-centric, multi-modal pivot to connect images and text. We find that initializing the vision tower of a giant CLIP from EVA can greatly stabilize training and outperform the from-scratch counterpart with far fewer samples and less compute, providing a new direction for scaling up and accelerating the costly training of multi-modal foundation models.
<div align=center>
<img src="https://user-images.githubusercontent.com/24734142/205410193-f1164e56-c117-4165-86f5-4cbfd797bc87.png" width="70%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('vit-base-p16_eva-mae-style-pre_8xb128-coslr-100e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k.py
```
Test:
```shell
python tools/test.py configs/eva/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221226-f61cf992.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :--------------------------------------------------- | :--------: | :-------: | :-------------------------------------------------------------: | :----------------------------------------------------------------: |
| `eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k` | 111.78 | 17.58 | [config](eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k_20221226-26d90f07.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k_20221226-26d90f07.json) |
| `beit-l-p14_3rdparty-eva_in21k`\* | 303.18 | 81.08 | [config](eva-l-p14_headless.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_3rdparty-mim_in21k_20221213-3a5da50b.pth) |
| `beit-l-p14_eva-pre_3rdparty_in21k`\* | 303.18 | 81.08 | [config](eva-l-p14_headless.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in21k_20221213-8f194fa2.pth) |
| `beit-g-p16_3rdparty-eva_30m`\* | 1011.32 | 203.52 | [config](eva-g-p16_headless.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p16_3rdparty_30m_20221213-7bed23ee.pth) |
| `beit-g-p14_3rdparty-eva_30m`\* | 1011.60 | 267.17 | [config](eva-g-p14_headless.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_3rdparty_30m_20221213-3b7aca97.pth) |
| `beit-g-p14_eva-30m-pre_3rdparty_in21k`\* | 1011.60 | 267.17 | [config](eva-g-p14_headless.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-pre_3rdparty_in21k_20221213-d72285b7.pth) |
*Models with * are converted from the [official repo](https://github.com/baaivision/EVA). The config files of these models are only for inference. We haven't reproduced the training results.*
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-------------------------------------- | :----------------------------------------: | :--------: | :-------: | :-------: | :-------: | :--------------------------------------: | :----------------------------------------: |
| `vit-base-p16_eva-mae-style-pre_8xb128-coslr-100e_in1k` | [EVA MAE STYLE](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k_20221226-26d90f07.pth) | 86.57 | 17.58 | 83.70 | N/A | [config](benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221226-f61cf992.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221226-f61cf992.json) |
| `vit-base-p16_eva-mae-style-pre_8xb2048-linear-coslr-100e_in1k` | [EVA MAE STYLE](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k_20221226-26d90f07.pth) | 86.57 | 17.58 | 69.00 | N/A | [config](benchmarks/vit-base-p16_8xb2048-linear-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221226-ef51bf09.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221226-ef51bf09.json) |
| `beit-l-p14_eva-pre_3rdparty_in1k-196px`\* | [EVA](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_3rdparty-mim_in21k_20221213-3a5da50b.pth) | 304.14 | 61.57 | 87.94 | 98.5 | [config](eva-l-p14_8xb16_in1k-196px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in1k-196px_20221214-2adf4d28.pth) |
| `beit-l-p14_eva-in21k-pre_3rdparty_in1k-196px`\* | EVA ImageNet-21k | 304.14 | 61.57 | 88.58 | 98.65 | [config](eva-l-p14_8xb16_in1k-196px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px_20221213-b730c7e7.pth) |
| `beit-l-p14_eva-pre_3rdparty_in1k-336px`\* | [EVA](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_3rdparty-mim_in21k_20221213-3a5da50b.pth) | 304.53 | 191.10 | 88.66 | 98.75 | [config](eva-l-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in1k-336px_20221214-07785cfd.pth) |
| `beit-l-p14_eva-in21k-pre_3rdparty_in1k-336px`\* | EVA ImageNet-21k | 304.53 | 191.10 | 89.17 | 98.86 | [config](eva-l-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px_20221213-f25b7634.pth) |
| `beit-g-p14_eva-30m-in21k-pre_3rdparty_in1k-336px`\* | [EVA 30M ImageNet-21k](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-pre_3rdparty_in21k_20221213-d72285b7.pth) | 1013.01 | 620.64 | 89.61 | 98.93 | [config](eva-g-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-336px_20221213-210f9071.pth) |
| `beit-g-p14_eva-30m-in21k-pre_3rdparty_in1k-560px`\* | [EVA 30M ImageNet-21k](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-pre_3rdparty_in21k_20221213-d72285b7.pth) | 1014.45 | 1906.76 | 89.71 | 98.96 | [config](eva-g-p14_8xb16_in1k-560px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-560px_20221213-fa1c3652.pth) |
*Models with * are converted from the [official repo](https://github.com/baaivision/EVA). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{EVA,
title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2211.07636},
year={2022}
}
```
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
out_type='avg_featmap',
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=4e-4, weight_decay=0.05, betas=(0.9, 0.999)),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.65,
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
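`LearningRateDecayOptimWrapperConstructor` with `layer_decay_rate=0.65` assigns each transformer block its own learning rate, decaying geometrically from the head toward the patch embedding. A small illustrative sketch of that scaling rule, assuming the usual convention for a 12-layer ViT-base where the head keeps the full base LR:

```python
# Illustrative only: per-layer LR multipliers for layer_decay_rate=0.65.
# Layer ids run from 0 (patch embedding) to num_layers + 1 (head).
base_lr = 4e-4
decay_rate = 0.65
num_layers = 12

for layer_id in range(num_layers + 2):
    scale = decay_rate ** (num_layers + 1 - layer_id)
    print(f'layer {layer_id:2d}: lr = {base_lr * scale:.2e}')
```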
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
out_type='cls_token',
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=dict(type='ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]),
data_preprocessor=dict(
num_classes=1000,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=3.2, weight_decay=0.0, momentum=0.9),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)
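The two schedulers above run back to back: 10 epochs of linear warmup from `lr * 1e-4` up to the base LR, then cosine annealing from epoch 10 to 100 down to `eta_min=0.0`. A quick sketch of the resulting multiplier, just to make the shape of the schedule explicit (the real schedulers step per iteration because `convert_to_iter_based=True`):

```python
import math

def lr_multiplier(epoch: int, warmup: int = 10, total: int = 100,
                  start_factor: float = 1e-4) -> float:
    """Epoch-level view of LinearLR warmup followed by CosineAnnealingLR."""
    if epoch < warmup:
        return start_factor + (1.0 - start_factor) * epoch / warmup
    progress = (epoch - warmup) / (total - warmup)
    return 0.5 * (1.0 + math.cos(math.pi * progress))

for e in (0, 5, 10, 50, 100):
    print(e, round(lr_multiplier(e), 4))
```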
_base_ = [
'../_base_/models/eva/eva-g.py',
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(img_size=336))
_base_ = [
'../_base_/models/eva/eva-g.py',
'../_base_/datasets/imagenet_bs16_eva_560.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(img_size=560))
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiTViT',
arch='eva-g',
img_size=224,
patch_size=14,
layer_scale_init_value=0.0,
out_type='avg_featmap',
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiTViT',
arch='eva-g',
img_size=224,
patch_size=16,
layer_scale_init_value=0.0,
out_type='avg_featmap',
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/models/eva/eva-l.py',
'../_base_/datasets/imagenet_bs16_eva_196.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(img_size=196))
_base_ = [
'../_base_/models/eva/eva-l.py',
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(img_size=336))
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiTViT',
arch='l',
img_size=224,
patch_size=14,
layer_scale_init_value=0.0,
out_type='avg_featmap',
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
layer_cfgs=dict(bias=True),
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='EVA',
backbone=dict(init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='MAEPretrainDecoder',
predict_feature_dim=512,
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
head=dict(
_delete_=True,
type='MIMHead',
loss=dict(
type='CosineSimilarityLoss', shift_factor=2.0, scale_factor=2.0),
),
target_generator=dict(
type='CLIPGenerator',
tokenizer_path= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/clip_vit_base_16.pth.tar' # noqa
),
init_cfg=None)
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
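`auto_scale_lr` lets the runner rescale the optimizer LR linearly with the actual total batch size when the auto-scale-LR option is enabled at launch. A sketch of the rule with the numbers from this config (base LR `1.5e-4 * 4096 / 256` at a base batch size of 4096, i.e. 16 GPUs x 256 images per GPU):

```python
# Linear LR scaling as expressed by auto_scale_lr; illustrative only, the
# framework applies it automatically when auto-scale-LR is enabled.
base_batch_size = 4096
base_lr = 1.5e-4 * 4096 / 256          # 2.4e-3, as set in the optimizer above

def scaled_lr(real_batch_size: int) -> float:
    return base_lr * real_batch_size / base_batch_size

print(scaled_lr(16 * 256))  # 0.0024, matches the configured setup
print(scaled_lr(8 * 256))   # 0.0012, half the LR for half the batch
```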