_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
)
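# The pipeline below runs before the preprocessor's BGR-to-RGB conversion,
# so augmentation fill values are given in BGR order (reversed mean/std).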
bgr_mean = data_preprocessor['mean'][::-1]
bgr_std = data_preprocessor['std'][::-1]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(
pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=bgr_mean,
fill_std=bgr_std),
dict(type='PackInputs'),
]
train_dataloader = dict(
batch_size=128,
num_workers=16,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix='train',
pipeline=train_pipeline),
sampler=dict(type='DefaultSampler', shuffle=True),
persistent_workers=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs'),
]
val_dataloader = dict(
batch_size=64,
num_workers=8,
pin_memory=True,
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/val.txt',
data_prefix='val',
pipeline=test_pipeline),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_dataloader = val_dataloader
model = dict(
backbone=dict(
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
# optimizer
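# Linear LR scaling: base LR 5e-4 for a 256-sample batch, scaled to the
# effective batch size of 8 GPUs x 128 samples per GPU.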
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=5e-4 * (8 * 128 / 256),
betas=(0.9, 0.999),
weight_decay=0.05),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.7,
custom_keys={
'.ln': dict(decay_mult=0.0), # do not decay on ln and bias
'.bias': dict(decay_mult=0.0)
}))
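# With the layer-wise decay constructor, layer_decay_rate=0.7 roughly gives
# layer i an LR of lr * 0.7 ** (num_layers - i), so layers closer to the
# input are updated more conservatively during fine-tuning.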
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
eta_min=1e-6,
by_epoch=True,
begin=5,
end=100,
convert_to_iter_based=True)
]
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=10)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1))
_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs256.py',
'../../_base_/default_runtime.py'
]
Collections:
- Name: MixMIM
Metadata:
Architecture:
- Attention Dropout
- Convolution
- Dense Connections
- Dropout
- GELU
- Layer Normalization
- Multi-Head Attention
- Scaled Dot-Product Attention
- Tanh Activation
Paper:
Title: 'MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning'
URL: https://arxiv.org/abs/2205.13137
README: configs/mixmim/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/mixmim.py
Version: v1.0.0rc4
Models:
- Name: mixmim_mixmim-base_16xb128-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 2048
FLOPs: 16351906816
Parameters: 114665784
Training Data: ImageNet-1k
In Collection: MixMIM
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.pth
Config: configs/mixmim/mixmim_mixmim-base_16xb128-coslr-300e_in1k.py
Downstream:
- mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
- Name: mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 1024
FLOPs: 16351906816
Parameters: 88344352
Training Data: ImageNet-1k
In Collection: MixMIM
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.63
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.pth
Config: configs/mixmim/benchmarks/mixmim-base_8xb128-coslr-100e_in1k.py
_base_ = '../_base_/default_runtime.py'
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.2, 1.0),
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5),
dict(type='PackInputs')
]
train_dataloader = dict(
batch_size=128,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# model settings
model = dict(
type='MixMIM',
backbone=dict(
type='MixMIMPretrainTransformer',
arch='B',
drop_rate=0.0,
drop_path_rate=0.0, # drop_path_rate=0.0 during pretraining
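# mask_ratio=0.5: MixMIM mixes two images with complementary masks, so each
# image contributes half of the visible tokens.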
mask_ratio=0.5),
neck=dict(
type='MixMIMPretrainDecoder',
num_patches=49,
encoder_stride=32,
embed_dim=1024,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16),
head=dict(
type='MixMIMPretrainHead',
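# norm_pix=True: reconstruction targets are per-patch normalized pixels,
# compared with an L2 loss (MAE-style).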
norm_pix=True,
loss=dict(type='PixelReconstructionLoss', criterion='L2')))
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * (2048 / 256),
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=1))
randomness = dict(seed=0, diff_rank_seed=True)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
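# When enabled (e.g. via the --auto-scale-lr flag of tools/train.py), the
# optimizer LR is multiplied by (actual total batch size / base_batch_size).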
# MLP-Mixer
> [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601)
<!-- [ALGORITHM] -->
## Abstract
Convolutional Neural Networks (CNNs) are the go-to model for computer vision. Recently, attention-based networks, such as the Vision Transformer, have also become popular. In this paper we show that while convolutions and attention are both sufficient for good performance, neither of them are necessary. We present MLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs). MLP-Mixer contains two types of layers: one with MLPs applied independently to image patches (i.e. "mixing" the per-location features), and one with MLPs applied across patches (i.e. "mixing" spatial information). When trained on large datasets, or with modern regularization schemes, MLP-Mixer attains competitive scores on image classification benchmarks, with pre-training and inference cost comparable to state-of-the-art models. We hope that these results spark further research beyond the realms of well established CNNs and Transformers.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/143178327-7118b48a-5f5f-4844-a614-a571917384ca.png" width="90%"/>
</div>
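To make the two kinds of mixing concrete, here is a minimal PyTorch sketch of one Mixer block (an illustration written for this README, not the mmpretrain backbone code; the class and argument names are made up for the example): the token-mixing MLP operates across patches, and the channel-mixing MLP operates within each patch.

```python
import torch
from torch import nn


class MixerBlock(nn.Module):
    """One Mixer block: token-mixing MLP followed by channel-mixing MLP."""

    def __init__(self, num_patches, embed_dims, tokens_mlp_dims, channels_mlp_dims):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dims)
        self.token_mlp = nn.Sequential(
            nn.Linear(num_patches, tokens_mlp_dims), nn.GELU(),
            nn.Linear(tokens_mlp_dims, num_patches))
        self.norm2 = nn.LayerNorm(embed_dims)
        self.channel_mlp = nn.Sequential(
            nn.Linear(embed_dims, channels_mlp_dims), nn.GELU(),
            nn.Linear(channels_mlp_dims, embed_dims))

    def forward(self, x):
        # x: (batch, num_patches, embed_dims)
        # Token mixing: transpose so the MLP acts along the patch axis.
        x = x + self.token_mlp(self.norm1(x).transpose(1, 2)).transpose(1, 2)
        # Channel mixing: the MLP acts along the feature axis of each patch.
        x = x + self.channel_mlp(self.norm2(x))
        return x


# Example with Mixer-Base-like sizes: 196 patches, 768-dim embeddings.
block = MixerBlock(196, 768, 384, 3072)
print(block(torch.rand(2, 196, 768)).shape)  # torch.Size([2, 196, 768])
```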
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mlp-mixer-base-p16_3rdparty_64xb64_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mlp-mixer-base-p16_3rdparty_64xb64_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------------: | :-------------------------------------------------------------: |
| `mlp-mixer-base-p16_3rdparty_64xb64_in1k`\* | From scratch | 59.88 | 12.61 | 76.68 | 92.25 | [config](mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) |
| `mlp-mixer-large-p16_3rdparty_64xb64_in1k`\* | From scratch | 208.20 | 44.57 | 72.34 | 88.02 | [config](mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) |
*Models with * are converted from the [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py) repository. The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@misc{tolstikhin2021mlpmixer,
title={MLP-Mixer: An all-MLP Architecture for Vision},
author={Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey Dosovitskiy},
year={2021},
eprint={2105.01601},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Collections:
- Name: MLP-Mixer
Metadata:
Training Data: ImageNet-1k
Architecture:
- MLP
- Layer Normalization
- Dropout
Paper:
URL: https://arxiv.org/abs/2105.01601
Title: "MLP-Mixer: An all-MLP Architecture for Vision"
README: configs/mlp_mixer/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.18.0/mmcls/models/backbones/mlp_mixer.py
Version: v0.18.0
Models:
- Name: mlp-mixer-base-p16_3rdparty_64xb64_in1k
In Collection: MLP-Mixer
Config: configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py
Metadata:
FLOPs: 12610000000 # 12.61 G
Parameters: 59880000 # 59.88 M
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.68
Top 5 Accuracy: 92.25
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224-76587d61.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py#L70
- Name: mlp-mixer-large-p16_3rdparty_64xb64_in1k
In Collection: MLP-Mixer
Config: configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py
Metadata:
FLOPs: 44570000000 # 44.57 G
Parameters: 208200000 # 208.2 M
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 72.34
Top 5 Accuracy: 88.02
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224_in21k-617b3de2.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py#L73
_base_ = [
'../_base_/models/mlp_mixer_base_patch16.py',
'../_base_/datasets/imagenet_bs64_mixer_224.py',
'../_base_/schedules/imagenet_bs4096_AdamW.py',
'../_base_/default_runtime.py',
]
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
_base_ = [
'../_base_/models/mlp_mixer_large_patch16.py',
'../_base_/datasets/imagenet_bs64_mixer_224.py',
'../_base_/schedules/imagenet_bs4096_AdamW.py',
'../_base_/default_runtime.py',
]
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
# MobileNet V2
> [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)
<!-- [ALGORITHM] -->
## Introduction
**MobileNet V2** was first described in [the paper](https://arxiv.org/pdf/1801.04381.pdf), which improves the state-of-the-art performance of mobile models on multiple tasks. MobileNetV2 builds on V1 with two new ideas, linear bottlenecks and inverted residuals: each residual block takes thin bottleneck layers as its input and output, while the intermediate expansion layer uses lightweight depthwise convolutions to filter features and serves as the source of non-linearity. The authors evaluate MobileNet V2 on ImageNet classification, COCO object detection, and VOC image segmentation.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/142563365-7a9ea577-8f79-4c21-a750-ebcaad9bcc2f.png" width="60%"/>
</div>
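The following is a minimal PyTorch sketch of one inverted residual block (an illustration, not the mmpretrain `MobileNetV2` backbone implementation; the names and default `expand_ratio` are only for the example): a 1x1 expansion, a 3x3 depthwise convolution, and a linear 1x1 projection back to a thin bottleneck, with a shortcut when the shapes match.

```python
import torch
from torch import nn


class InvertedResidual(nn.Module):
    """Expansion -> depthwise conv -> linear projection, with optional shortcut."""

    def __init__(self, in_channels, out_channels, stride=1, expand_ratio=6):
        super().__init__()
        hidden = in_channels * expand_ratio
        self.use_shortcut = stride == 1 and in_channels == out_channels
        self.block = nn.Sequential(
            # 1x1 pointwise expansion to a wider representation.
            nn.Conv2d(in_channels, hidden, 1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU6(inplace=True),
            # 3x3 depthwise convolution (one filter per channel).
            nn.Conv2d(hidden, hidden, 3, stride, 1, groups=hidden, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU6(inplace=True),
            # Linear 1x1 projection back to a thin bottleneck (no activation).
            nn.Conv2d(hidden, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels))

    def forward(self, x):
        out = self.block(x)
        return x + out if self.use_shortcut else out


block = InvertedResidual(32, 32, stride=1)
print(block(torch.rand(1, 32, 56, 56)).shape)  # torch.Size([1, 32, 56, 56])
```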
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3.
The MobileNetV2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers, opposite to traditional residual models which use expanded representations in the input. MobileNetV2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on ImageNet classification, COCO object detection, and VOC image segmentation. We evaluate the trade-offs between accuracy and number of operations measured by multiply-adds (MAdd), as well as the number of parameters.
</br>
</details>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mobilenet-v2_8xb32_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mobilenet-v2_8xb32_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py
```
Test:
```shell
python tools/test.py configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------: | :----------------------------------------------------------------------------------------: |
| `mobilenet-v2_8xb32_in1k` | From scratch | 3.50 | 0.32 | 71.86 | 90.42 | [config](mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.json) |
## Citation
```bibtex
@INPROCEEDINGS{8578572,
author={M. {Sandler} and A. {Howard} and M. {Zhu} and A. {Zhmoginov} and L. {Chen}},
booktitle={2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
title={MobileNetV2: Inverted Residuals and Linear Bottlenecks},
year={2018},
volume={},
number={},
pages={4510-4520},
doi={10.1109/CVPR.2018.00474}
}
```
Collections:
- Name: MobileNet V2
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- SGD with Momentum
- Weight Decay
Training Resources: 8x V100 GPUs
Epochs: 300
Batch Size: 256
Architecture:
- MobileNet V2
Paper:
URL: https://arxiv.org/abs/1801.04381
Title: "MobileNetV2: Inverted Residuals and Linear Bottlenecks"
README: configs/mobilenet_v2/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.15.0/mmcls/models/backbones/mobilenet_v2.py#L101
Version: v0.15.0
Models:
- Name: mobilenet-v2_8xb32_in1k
Metadata:
FLOPs: 319000000
Parameters: 3500000
In Collection: MobileNet V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 71.86
Top 5 Accuracy: 90.42
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth
Config: configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py
_base_ = [
'../_base_/models/mobilenet_v2_1x.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_epochstep.py',
'../_base_/default_runtime.py'
]
# MobileNet V3
> [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244)
<!-- [ALGORITHM] -->
## Introduction
**MobileNet V3** was first described in [the paper](https://arxiv.org/pdf/1905.02244.pdf). Its architecture is found by network architecture search (NAS), inherits the practical building blocks of V1 and V2, and adds squeeze-and-excitation (SE) channel attention. The authors release two new models, MobileNetV3-Large and MobileNetV3-Small, targeted at high- and low-resource use cases; these models are then adapted and applied to object detection and semantic segmentation. The authors evaluate MobileNet V3 on ImageNet classification, COCO object detection, and Cityscapes segmentation.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/142563801-ef4feacc-ecd7-4d14-a411-8c9d63571749.png" width="60%"/>
</div>
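As a small illustration of the SE channel attention mentioned above, here is a rough PyTorch sketch of a squeeze-and-excitation block as used inside MobileNetV3 bottlenecks (not the mmpretrain implementation; the class name and reduction factor are assumptions for the example).

```python
import torch
from torch import nn


class SqueezeExcite(nn.Module):
    """Channel attention: squeeze by global pooling, excite with a small MLP."""

    def __init__(self, channels, reduction=4):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Hardsigmoid())  # MobileNetV3 uses a hard sigmoid gate

    def forward(self, x):
        b, c, _, _ = x.shape
        # Per-channel weights in [0, 1], broadcast back over the spatial dims.
        weight = self.fc(self.pool(x).flatten(1)).view(b, c, 1, 1)
        return x * weight


se = SqueezeExcite(64)
print(se(torch.rand(1, 64, 28, 28)).shape)  # torch.Size([1, 64, 28, 28])
```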
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
We present the next generation of MobileNets based on a combination of complementary search techniques as well as a novel architecture design. MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. This paper starts the exploration of how automated search algorithms and network design can work together to harness complementary approaches improving the overall state of the art. Through this process we create two new MobileNet models for release: MobileNetV3-Large and MobileNetV3-Small which are targeted for high and low resource use cases. These models are then adapted and applied to the tasks of object detection and semantic segmentation. For the task of semantic segmentation (or any dense pixel prediction), we propose a new efficient segmentation decoder Lite Reduced Atrous Spatial Pyramid Pooling (LR-ASPP). We achieve new state of the art results for mobile classification, detection and segmentation. MobileNetV3-Large is 3.2% more accurate on ImageNet classification while reducing latency by 15% compared to MobileNetV2. MobileNetV3-Small is 4.6% more accurate while reducing latency by 5% compared to MobileNetV2. MobileNetV3-Large detection is 25% faster at roughly the same accuracy as MobileNetV2 on COCO detection. MobileNetV3-Large LR-ASPP is 30% faster than MobileNetV2 R-ASPP at similar accuracy for Cityscapes segmentation.
</br>
</details>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mobilenet-v3-small-050_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mobilenet-v3-small-050_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mobilenet_v3/mobilenet-v3-small_8xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/mobilenet_v3/mobilenet-v3-small-050_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small-050_3rdparty_in1k_20221114-e0b86be1.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :--------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :---------------------------------------------: | :--------------------------------------------------------------: |
| `mobilenet-v3-small-050_3rdparty_in1k`\* | From scratch | 1.59 | 0.02 | 57.91 | 80.19 | [config](mobilenet-v3-small-050_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small-050_3rdparty_in1k_20221114-e0b86be1.pth) |
| `mobilenet-v3-small-075_3rdparty_in1k`\* | From scratch | 2.04 | 0.04 | 65.23 | 85.44 | [config](mobilenet-v3-small-075_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small-075_3rdparty_in1k_20221114-2011fa76.pth) |
| `mobilenet-v3-small_8xb128_in1k` | From scratch | 2.54 | 0.06 | 66.68 | 86.74 | [config](mobilenet-v3-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small_8xb128_in1k_20221114-bd1bfcde.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small_8xb128_in1k_20221114-bd1bfcde.json) |
| `mobilenet-v3-small_3rdparty_in1k`\* | From scratch | 2.54 | 0.06 | 67.66 | 87.41 | [config](mobilenet-v3-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth) |
| `mobilenet-v3-large_8xb128_in1k` | From scratch | 5.48 | 0.23 | 73.49 | 91.31 | [config](mobilenet-v3-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-large_8xb128_in1k_20221114-0ed9ed9a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-large_8xb128_in1k_20221114-0ed9ed9a.json) |
| `mobilenet-v3-large_3rdparty_in1k`\* | From scratch | 5.48 | 0.23 | 74.04 | 91.34 | [config](mobilenet-v3-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth) |
*Models with * are converted from the [official repo](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{Howard_2019_ICCV,
author = {Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and Pang, Ruoming and Vasudevan, Vijay and Le, Quoc V. and Adam, Hartwig},
title = {Searching for MobileNetV3},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2019}
}
```
Collections:
- Name: MobileNet V3
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- RMSprop with Momentum
- Weight Decay
Training Resources: 8x V100 GPUs
Epochs: 600
Batch Size: 1024
Architecture:
- MobileNet V3
Paper:
URL: https://arxiv.org/abs/1905.02244
Title: Searching for MobileNetV3
README: configs/mobilenet_v3/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.15.0/mmcls/models/backbones/mobilenet_v3.py
Version: v0.15.0
Models:
- Name: mobilenet-v3-small-050_3rdparty_in1k
Metadata:
FLOPs: 24895000
Parameters: 1590000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 57.91
Top 5 Accuracy: 80.19
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small-050_3rdparty_in1k_20221114-e0b86be1.pth
Config: configs/mobilenet_v3/mobilenet-v3-small-050_8xb128_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_050_lambc-4b7bbe87.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/mobilenetv3.py
- Name: mobilenet-v3-small-075_3rdparty_in1k
Metadata:
FLOPs: 44791000
Parameters: 2040000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 65.23
Top 5 Accuracy: 85.44
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small-075_3rdparty_in1k_20221114-2011fa76.pth
Config: configs/mobilenet_v3/mobilenet-v3-small-075_8xb128_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_075_lambc-384766db.pth
Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/mobilenetv3.py
- Name: mobilenet-v3-small_8xb128_in1k
Metadata:
FLOPs: 60000000
Parameters: 2540000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 66.68
Top 5 Accuracy: 86.74
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-small_8xb128_in1k_20221114-bd1bfcde.pth
Config: configs/mobilenet_v3/mobilenet-v3-small_8xb128_in1k.py
- Name: mobilenet-v3-small_3rdparty_in1k
Metadata:
FLOPs: 60000000
Parameters: 2540000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 67.66
Top 5 Accuracy: 87.41
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth
Config: configs/mobilenet_v3/mobilenet-v3-small_8xb128_in1k.py
Converted From:
Weights: https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth
Code: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py
- Name: mobilenet-v3-large_8xb128_in1k
Metadata:
FLOPs: 230000000
Parameters: 5480000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 73.49
Top 5 Accuracy: 91.31
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/mobilenet-v3-large_8xb128_in1k_20221114-0ed9ed9a.pth
Config: configs/mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py
- Name: mobilenet-v3-large_3rdparty_in1k
Metadata:
FLOPs: 230000000
Parameters: 5480000
In Collection: MobileNet V3
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.04
Top 5 Accuracy: 91.34
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth
Config: configs/mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py
Converted From:
Weights: https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth
Code: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py
# Refers to https://pytorch.org/blog/ml-models-torchvision-v0.9/#classification
_base_ = [
'../_base_/models/mobilenet_v3/mobilenet_v3_large_imagenet.py',
'../_base_/datasets/imagenet_bs128_mbv3.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(
type='RMSprop',
lr=0.064,
alpha=0.9,
momentum=0.9,
eps=0.0316,
weight_decay=1e-5))
param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973)
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=1024)
_base_ = [
'../_base_/models/mobilenet_v3/mobilenet_v3_small_050_imagenet.py',
'../_base_/datasets/imagenet_bs128_mbv3.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(backbone=dict(norm_cfg=dict(type='BN', eps=1e-5, momentum=0.1)))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='AutoAugment',
policies='imagenet',
hparams=dict(pad_val=[round(x) for x in [103.53, 116.28, 123.675]])),
dict(
type='RandomErasing',
erase_prob=0.2,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# If you want the standard test setting, please configure the test dataset manually
test_dataloader = val_dataloader
# schedule settings
optim_wrapper = dict(
optimizer=dict(
type='RMSprop',
lr=0.064,
alpha=0.9,
momentum=0.9,
eps=0.0316,
weight_decay=1e-5))
param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973)
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=10)
val_cfg = dict()
test_cfg = dict()
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=1024)
_base_ = [
'../_base_/models/mobilenet_v3/mobilenet_v3_small_075_imagenet.py',
'../_base_/datasets/imagenet_bs128_mbv3.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(backbone=dict(norm_cfg=dict(type='BN', eps=1e-5, momentum=0.1)))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='AutoAugment',
policies='imagenet',
hparams=dict(pad_val=[round(x) for x in [103.53, 116.28, 123.675]])),
dict(
type='RandomErasing',
erase_prob=0.2,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# schedule settings
optim_wrapper = dict(
optimizer=dict(
type='RMSprop',
lr=0.064,
alpha=0.9,
momentum=0.9,
eps=0.0316,
weight_decay=1e-5))
param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973)
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=10)
val_cfg = dict()
test_cfg = dict()
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=1024)
# Refers to https://pytorch.org/blog/ml-models-torchvision-v0.9/#classification
_base_ = [
'../_base_/models/mobilenet_v3/mobilenet_v3_small_imagenet.py',
'../_base_/datasets/imagenet_bs128_mbv3.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(
type='RMSprop',
lr=0.064,
alpha=0.9,
momentum=0.9,
eps=0.0316,
weight_decay=1e-5))
param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973)
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=1024)
_base_ = [
'../_base_/models/mobilenet_v3/mobilenet_v3_small_cifar.py',
'../_base_/datasets/cifar10_bs16.py',
'../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py'
]
# schedule settings
param_scheduler = dict(
type='MultiStepLR',
by_epoch=True,
milestones=[120, 170],
gamma=0.1,
)
train_cfg = dict(by_epoch=True, max_epochs=200)
# MobileOne
> [An Improved One millisecond Mobile Backbone](https://arxiv.org/abs/2206.04040)
<!-- [ALGORITHM] -->
## Introduction
MobileOne is proposed by Apple and is based on structural reparameterization. On Apple chips, the model reaches roughly 76% top-1 accuracy on ImageNet with an inference latency under 1 ms. Its main improvements over [RepVGG](../repvgg) are the following (a fused conv+BN sketch follows the figure below):
- Reparameterization with depthwise and pointwise convolutions instead of standard convolutions.
- Removal of the residual structure, which is unfriendly to memory access.
<div align=center>
<img src="https://user-images.githubusercontent.com/18586273/183552452-74657532-f461-48f7-9aa7-c23f006cdb07.png" width="40%"/>
</div>
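The core of reparameterization is that training-time branches can be folded into a single convolution for inference. Below is a minimal sketch of the basic step, fusing a `Conv2d` + `BatchNorm2d` pair into one convolution (an illustration under simplified assumptions, not the mmpretrain MobileOne code, which also merges the parallel depthwise/pointwise branches).

```python
import torch
from torch import nn


def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold a BatchNorm's affine transform into the preceding convolution."""
    fused = nn.Conv2d(
        conv.in_channels, conv.out_channels, conv.kernel_size,
        stride=conv.stride, padding=conv.padding, groups=conv.groups, bias=True)
    std = (bn.running_var + bn.eps).sqrt()
    scale = bn.weight / std
    # Scale each output filter and shift the bias accordingly.
    fused.weight.data = conv.weight * scale.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias if conv.bias is not None else torch.zeros(conv.out_channels)
    fused.bias.data = bn.bias + (conv_bias - bn.running_mean) * scale
    return fused


conv = nn.Conv2d(8, 16, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(16).eval()
x = torch.rand(1, 8, 32, 32)
fused = fuse_conv_bn(conv, bn)
print(torch.allclose(bn(conv(x)), fused(x), atol=1e-6))  # True
```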
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
Efficient neural network backbones for mobile devices are often optimized for metrics such as FLOPs or parameter count. However, these metrics may not correlate well with latency of the network when deployed on a mobile device. Therefore, we perform extensive analysis of different metrics by deploying several mobile-friendly networks on a mobile device. We identify and analyze architectural and optimization bottlenecks in recent efficient neural networks and provide ways to mitigate these bottlenecks. To this end, we design an efficient backbone MobileOne, with variants achieving an inference time under 1 ms on an iPhone12 with 75.9% top-1 accuracy on ImageNet. We show that MobileOne achieves state-of-the-art performance within the efficient architectures while being many times faster on mobile. Our best model obtains similar performance on ImageNet as MobileFormer while being 38x faster. Our model obtains 2.3% better top-1 accuracy on ImageNet than EfficientNet at similar latency. Furthermore, we show that our model generalizes to multiple tasks - image classification, object detection, and semantic segmentation with significant improvements in latency and accuracy as compared to existing efficient architectures when deployed on a mobile device.
</br>
</details>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mobileone-s0_8xb32_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mobileone-s0_8xb32_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mobileone/mobileone-s0_8xb32_in1k.py
```
Test:
```shell
python tools/test.py configs/mobileone/mobileone-s0_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------: | :----------------------------------------------------------------------------------------: |
| `mobileone-s0_8xb32_in1k` | From scratch | 2.08 | 0.27 | 71.34 | 89.87 | [config](mobileone-s0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.json) |
| `mobileone-s1_8xb32_in1k` | From scratch | 4.76 | 0.82 | 75.72 | 92.54 | [config](mobileone-s1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_8xb32_in1k_20221110-ceeef467.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_8xb32_in1k_20221110-ceeef467.json) |
| `mobileone-s2_8xb32_in1k` | From scratch | 7.81 | 1.30 | 77.37 | 93.34 | [config](mobileone-s2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_8xb32_in1k_20221110-9c7ecb97.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_8xb32_in1k_20221110-9c7ecb97.json) |
| `mobileone-s3_8xb32_in1k` | From scratch | 10.08 | 1.89 | 78.06 | 93.83 | [config](mobileone-s3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_8xb32_in1k_20221110-c95eb3bf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_8xb32_in1k_20221110-c95eb3bf.json) |
| `mobileone-s4_8xb32_in1k` | From scratch | 14.84 | 2.98 | 79.69 | 94.46 | [config](mobileone-s4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.json) |
## Citation
```bibtex
@article{mobileone2022,
title={An Improved One millisecond Mobile Backbone},
author={Vasu, Pavan Kumar Anasosalu and Gabriel, James and Zhu, Jeff and Tuzel, Oncel and Ranjan, Anurag},
journal={arXiv preprint arXiv:2206.04040},
year={2022}
}
```
_base_ = ['../mobileone-s0_8xb32_in1k.py']
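# deploy=True builds the backbone in its reparameterized single-branch form
# for inference; the training-time parallel branches are fused away.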
model = dict(backbone=dict(deploy=True))