Commit 0fd8347d authored by unknown

Add the mmclassification-0.24.1 code, remove mmclassification-speed-benchmark

parent cc567e9e
_base_ = [
'../_base_/models/convnext/convnext-tiny.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
data = dict(samples_per_gpu=128)
optimizer = dict(lr=4e-3)
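# Keep an exponential moving average (EMA) of the model weights; `momentum`
# is the per-update EMA rate (see mmcv's EMAHook).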
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
_base_ = [
'../_base_/models/convnext/convnext-xlarge.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
data = dict(samples_per_gpu=64)
optimizer = dict(lr=4e-3)
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
Collections:
- Name: ConvNeXt
Metadata:
Training Data: ImageNet-1k
Architecture:
- 1x1 Convolution
- LayerScale
Paper:
URL: https://arxiv.org/abs/2201.03545v1
Title: A ConvNet for the 2020s
README: configs/convnext/README.md
Code:
Version: v0.20.1
URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py
Models:
- Name: convnext-tiny_3rdparty_32xb128_in1k
Metadata:
FLOPs: 4457472768
Parameters: 28589128
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.05
Top 5 Accuracy: 95.86
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth
Config: configs/convnext/convnext-tiny_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-tiny_3rdparty_32xb128-noema_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 4457472768
Parameters: 28589128
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.81
Top 5 Accuracy: 95.67
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128-noema_in1k_20220222-2908964a.pth
Config: configs/convnext/convnext-tiny_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-small_3rdparty_32xb128_in1k
Metadata:
FLOPs: 8687008512
Parameters: 50223688
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.13
Top 5 Accuracy: 96.44
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth
Config: configs/convnext/convnext-small_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-small_3rdparty_32xb128-noema_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 8687008512
Parameters: 50223688
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.11
Top 5 Accuracy: 96.34
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128-noema_in1k_20220222-fa001ca5.pth
Config: configs/convnext/convnext-small_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-base_3rdparty_32xb128_in1k
Metadata:
FLOPs: 15359124480
Parameters: 88591464
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.85
Top 5 Accuracy: 96.74
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth
Config: configs/convnext/convnext-base_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-base_3rdparty_32xb128-noema_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 15359124480
Parameters: 88591464
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.71
Top 5 Accuracy: 96.60
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth
Config: configs/convnext/convnext-base_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-base_3rdparty_in21k
Metadata:
Training Data: ImageNet-21k
FLOPs: 15359124480
Parameters: 88591464
In Collection: ConvNeXt
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-base_in21k-pre-3rdparty_32xb128_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 15359124480
Parameters: 88591464
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.81
Top 5 Accuracy: 97.86
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth
Config: configs/convnext/convnext-base_32xb128_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-large_3rdparty_64xb64_in1k
Metadata:
FLOPs: 34368026112
Parameters: 197767336
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.30
Top 5 Accuracy: 96.89
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth
Config: configs/convnext/convnext-large_64xb64_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-large_3rdparty_in21k
Metadata:
Training Data: ImageNet-21k
FLOPs: 34368026112
Parameters: 197767336
In Collection: ConvNeXt
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-large_in21k-pre-3rdparty_64xb64_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 34368026112
Parameters: 197767336
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.61
Top 5 Accuracy: 98.04
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth
Config: configs/convnext/convnext-large_64xb64_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-xlarge_3rdparty_in21k
Metadata:
Training Data: ImageNet-21k
FLOPs: 60929820672
Parameters: 350196968
In Collection: ConvNeXt
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth
Code: https://github.com/facebookresearch/ConvNeXt
- Name: convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 60929820672
Parameters: 350196968
In Collection: ConvNeXt
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.97
Top 5 Accuracy: 98.20
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth
Config: configs/convnext/convnext-xlarge_64xb64_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth
Code: https://github.com/facebookresearch/ConvNeXt
# CSPNet
> [CSPNet: A New Backbone that can Enhance Learning Capability of CNN](https://arxiv.org/abs/1911.11929)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Neural networks have enabled state-of-the-art approaches to achieve incredible results on computer vision tasks such as object detection. However, such success greatly relies on costly computation resources, which hinders people with cheap devices from appreciating the advanced technology. In this paper, we propose Cross Stage Partial Network (CSPNet) to mitigate the problem that previous works require heavy inference computations from the network architecture perspective. We attribute the problem to the duplicate gradient information within network optimization. The proposed networks respect the variability of the gradients by integrating feature maps from the beginning and the end of a network stage, which, in our experiments, reduces computations by 20% with equivalent or even superior accuracy on the ImageNet dataset, and significantly outperforms state-of-the-art approaches in terms of AP50 on the MS COCO object detection dataset. The CSPNet is easy to implement and general enough to cope with architectures based on ResNet, ResNeXt, and DenseNet. The source code is publicly available.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/18586273/159420842-6147c687-a488-460c-8bb2-4ea5276c26c7.png" width="60%"/>
</div>
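The cross-stage design described in the abstract is easy to sketch: a stage's input is split along channels, one part bypasses the stage's blocks entirely, and the two parts are fused back through a transition convolution. The PyTorch module below is a minimal illustrative sketch under our own naming, not the mmcls `CSPNet` backbone:

```python
import torch
import torch.nn as nn


class CSPStage(nn.Module):
    """Minimal illustrative cross-stage partial stage (not the mmcls code)."""

    def __init__(self, channels: int, num_blocks: int = 2):
        super().__init__()
        mid = channels // 2
        # Dense path: the stage's usual conv blocks, applied to half the channels.
        self.blocks = nn.Sequential(*[
            nn.Sequential(
                nn.Conv2d(mid, mid, 3, padding=1, bias=False),
                nn.BatchNorm2d(mid),
                nn.ReLU(inplace=True),
            ) for _ in range(num_blocks)
        ])
        # 1x1 transition conv fuses the two paths after concatenation.
        self.transition = nn.Conv2d(channels, channels, 1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        part1, part2 = torch.chunk(x, 2, dim=1)  # channel-wise split
        part2 = self.blocks(part2)               # only one part goes through the blocks
        # The untouched part carries its gradients past the stage, avoiding the
        # duplicate gradient flow the paper identifies as the source of the cost.
        return self.transition(torch.cat([part1, part2], dim=1))


x = torch.randn(2, 64, 56, 56)
print(CSPStage(64)(x).shape)  # torch.Size([2, 64, 56, 56])
```

The channel `chunk` here stands in for the stage-level partial transitions of the paper; the actual backbones in the configs below (`CSPDarkNet`, `CSPResNet`, `CSPResNeXt`) implement several such variants.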
## Results and models
### ImageNet-1k
| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------: | :---------------------------------------------------------------------: |
| CSPDarkNet50\* | From scratch | 27.64 | 5.04 | 80.05 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspdarknet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth) |
| CSPResNet50\* | From scratch | 21.62 | 3.48 | 79.55 | 94.68 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth) |
| CSPResNeXt50\* | From scratch | 20.57 | 3.11 | 79.96 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnext50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth) |
*Models with * are converted from the [timm repo](https://github.com/rwightman/pytorch-image-models). The config files of these models are only for inference. We haven't verified the training accuracy of these configs and welcome you to contribute your reproduction results.*
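As a quick sanity check, the converted checkpoints can be used for inference through the high-level mmcls API. A minimal sketch, assuming it is run from the repository root so the config path and the bundled `demo/demo.JPEG` image resolve:

```python
from mmcls.apis import inference_model, init_model

config = 'configs/cspnet/cspdarknet50_8xb32_in1k.py'
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/cspnet/'
              'cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth')

# Build the model and load the converted checkpoint (downloaded on demand).
model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo/demo.JPEG')
print(result['pred_class'], result['pred_score'])
```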
## Citation
```bibtex
@inproceedings{wang2020cspnet,
title={CSPNet: A new backbone that can enhance learning capability of CNN},
author={Wang, Chien-Yao and Liao, Hong-Yuan Mark and Wu, Yueh-Hua and Chen, Ping-Yang and Hsieh, Jun-Wei and Yeh, I-Hau},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops},
pages={390--391},
year={2020}
}
```
_base_ = [
'../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPDarkNet', depth=53),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
topk=(1, 5),
))
# dataset settings
dataset_type = 'ImageNet'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='ToTensor', keys=['gt_label']),
dict(type='Collect', keys=['img', 'gt_label'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
size=(288, -1),
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
]
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_prefix='data/imagenet/train',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline),
test=dict(
# replace `data/val` with `data/test` for standard test
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='accuracy')
_base_ = [
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPResNet', depth=50),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
topk=(1, 5),
))
# dataset settings
dataset_type = 'ImageNet'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='ToTensor', keys=['gt_label']),
dict(type='Collect', keys=['img', 'gt_label'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
size=(288, -1),
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
]
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_prefix='data/imagenet/train',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline),
test=dict(
# replace `data/val` with `data/test` for standard test
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='accuracy')
_base_ = [
'../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPResNeXt', depth=50),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=2048,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
topk=(1, 5),
))
# dataset settings
dataset_type = 'ImageNet'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='ToTensor', keys=['gt_label']),
dict(type='Collect', keys=['img', 'gt_label'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
size=(256, -1),
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
]
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_prefix='data/imagenet/train',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline),
test=dict(
# replace `data/val` with `data/test` for standard test
type=dataset_type,
data_prefix='data/imagenet/val',
ann_file='data/imagenet/meta/val.txt',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='accuracy')
Collections:
- Name: CSPNet
Metadata:
Training Data: ImageNet-1k
Architecture:
- Cross Stage Partial Stage
Paper:
URL: https://arxiv.org/abs/1911.11929
Title: 'CSPNet: A New Backbone that can Enhance Learning Capability of CNN'
README: configs/cspnet/README.md
Code:
Version: v0.22.0
URL: https://github.com/open-mmlab/mmclassification/blob/v0.22.0/mmcls/models/backbones/cspnet.py
Models:
- Name: cspdarknet50_3rdparty_8xb32_in1k
Metadata:
FLOPs: 5040000000
Parameters: 27640000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.05
Top 5 Accuracy: 95.07
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth
Config: configs/cspnet/cspdarknet50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspdarknet53_ra_256-d05c7c21.pth
Code: https://github.com/rwightman/pytorch-image-models
- Name: cspresnet50_3rdparty_8xb32_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3480000000
Parameters: 21620000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.55
Top 5 Accuracy: 94.68
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth
Config: configs/cspnet/cspresnet50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnet50_ra-d3e8d487.pth
Code: https://github.com/rwightman/pytorch-image-models
- Name: cspresnext50_3rdparty_8xb32_in1k
Metadata:
FLOPs: 3110000000
Parameters: 20570000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.96
Top 5 Accuracy: 94.96
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth
Config: configs/cspnet/cspresnext50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnext50_ra_224-648b4713.pth
Code: https://github.com/rwightman/pytorch-image-models
# CSRA
> [Residual Attention: A Simple but Effective Method for Multi-Label Recognition](https://arxiv.org/abs/2108.02456)
<!-- [ALGORITHM] -->
## Abstract
Multi-label image recognition is a challenging computer vision task of practical use. Progress in this area, however, is often characterized by complicated methods, heavy computations, and a lack of intuitive explanations. To effectively capture different spatial regions occupied by objects from different categories, we propose an embarrassingly simple module, named class-specific residual attention (CSRA). CSRA generates class-specific features for every category by proposing a simple spatial attention score, and then combines it with the class-agnostic average pooling feature. CSRA achieves state-of-the-art results on multi-label recognition while being much simpler than previous methods. Furthermore, with only 4 lines of code, CSRA also leads to consistent improvement across many diverse pretrained models and datasets without any extra training. CSRA is both easy to implement and light in computations, and it also enjoys intuitive explanations and visualizations.
<div align=center>
<img src="https://user-images.githubusercontent.com/84259897/176982245-3ffcff56-a4ea-4474-9967-bc2b612bbaa3.png" width="80%"/>
</div>
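The residual attention itself is essentially the "4 lines of code" the abstract mentions: a per-class spatial score map is pooled twice, once by class-agnostic average pooling and once by a temperature-softmax spatial attention, and the two are combined with weight `lam`. Below is a minimal single-head sketch under our own naming, not the mmcls implementation (the `CSRAClsHead` in the config below exposes the corresponding `num_heads` and `lam` options):

```python
import torch
import torch.nn as nn


class CSRAModule(nn.Module):
    """Minimal single-head class-specific residual attention (illustrative)."""

    def __init__(self, in_channels: int, num_classes: int,
                 lam: float = 0.1, T: float = 1.0):
        super().__init__()
        # A 1x1 conv is the fully-connected classifier applied at every
        # location, producing a per-class spatial score map.
        self.classifier = nn.Conv2d(in_channels, num_classes, 1, bias=False)
        self.lam, self.T = lam, T

    def forward(self, feat: torch.Tensor) -> torch.Tensor:
        score = self.classifier(feat).flatten(2)     # (N, num_classes, H*W)
        base = score.mean(dim=2)                     # class-agnostic average pooling
        attn = torch.softmax(score * self.T, dim=2)  # spatial attention per class
        att = (attn * score).sum(dim=2)              # attention-weighted score
        return base + self.lam * att                 # residual combination


logits = CSRAModule(2048, 20)(torch.randn(2, 2048, 14, 14))
print(logits.shape)  # torch.Size([2, 20])
```

The `num_heads=1` and `lam=0.1` values in the config below correspond to the single-head setting sketched here.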
## Results and models
### VOC2007
| Model | Pretrain | Params(M) | Flops(G) | mAP | OF1 (%) | CF1 (%) | Config | Download |
| :------------: | :------------------------------------------------: | :-------: | :------: | :---: | :-----: | :-----: | :-----------------------------------------------: | :-------------------------------------------------: |
| Resnet101-CSRA | [ImageNet-1k](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth) | 23.55 | 4.12 | 94.98 | 90.80 | 89.16 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/csra/resnet101-csra_1xb16_voc07-448px.py) | [model](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.log.json) |
## Citation
```bibtex
@misc{zhu2021csra,
doi = {10.48550/ARXIV.2108.02456},
url = {https://arxiv.org/abs/2108.02456},
author = {Zhu, Ke and Wu, Jianxin},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
title = {Residual Attention: A Simple but Effective Method for Multi-Label Recognition},
publisher = {arXiv},
year = {2021},
copyright = {arXiv.org perpetual, non-exclusive license}
}
```
Collections:
- Name: CSRA
Metadata:
Training Data: PASCAL VOC 2007
Architecture:
- Class-specific Residual Attention
Paper:
URL: https://arxiv.org/abs/2108.02456
Title: 'Residual Attention: A Simple but Effective Method for Multi-Label Recognition'
README: configs/csra/README.md
Code:
Version: v0.24.0
URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/heads/multi_label_csra_head.py
Models:
- Name: resnet101-csra_1xb16_voc07-448px
Metadata:
FLOPs: 4120000000
Parameters: 23550000
In Collection: CSRA
Results:
- Dataset: PASCAL VOC 2007
Metrics:
mAP: 94.98
OF1: 90.80
CF1: 89.16
Task: Multi-Label Classification
Weights: https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth
Config: configs/csra/resnet101-csra_1xb16_voc07-448px.py
_base_ = ['../_base_/datasets/voc_bs16.py', '../_base_/default_runtime.py']
# Pre-trained Checkpoint Path
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth' # noqa
# To use the ResNet101-CutMix pre-trained weight from the original
# repo (https://github.com/Kevinz-code/CSRA), the script
# 'tools/convert_models/torchvision_to_mmcls.py' can convert it into
# mmcls format. With that weight, the mAP reaches 95.5.
# checkpoint = 'PATH/TO/PRE-TRAINED_WEIGHT'
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(3, ),
style='pytorch',
init_cfg=dict(
type='Pretrained', checkpoint=checkpoint, prefix='backbone')),
neck=None,
head=dict(
type='CSRAClsHead',
num_classes=20,
in_channels=2048,
num_heads=1,
lam=0.1,
loss=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)))
# dataset setting
img_norm_cfg = dict(mean=[0, 0, 0], std=[255, 255, 255], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', size=448, scale=(0.7, 1.0)),
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='ToTensor', keys=['gt_label']),
dict(type='Collect', keys=['img', 'gt_label'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', size=448),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
]
data = dict(
# treat the difficult examples as negative ones (label 0)
train=dict(pipeline=train_pipeline, difficult_as_postive=False),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
# The lr of classifier.head is 10 * base_lr, which helps convergence.
optimizer = dict(
type='SGD',
lr=0.0002,
momentum=0.9,
weight_decay=0.0001,
paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10)}))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
step=6,
gamma=0.1,
warmup='linear',
warmup_iters=1,
warmup_ratio=1e-7,
warmup_by_epoch=True)
runner = dict(type='EpochBasedRunner', max_epochs=20)
# DeiT
> [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877)
<!-- [ALGORITHM] -->
## Abstract
Recently, neural networks purely based on attention were shown to address image understanding tasks such as image classification. However, these visual transformers are pre-trained with hundreds of millions of images using an expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation token ensuring that the student learns from the teacher through attention. We show the interest of this token-based distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and models.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/143225703-c287c29e-82c9-4c85-a366-dfae30d198cd.png" width="40%"/>
</div>
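During training, DeiT supervises the class token with the ground-truth label and the extra distillation token with the teacher's hard prediction, averaging the two cross-entropy terms. The helper below is a minimal sketch of that hard-distillation loss under our own names, not MMClassification code (training the distilled models is not supported here, as the warning below notes):

```python
import torch
import torch.nn.functional as F


def hard_distillation_loss(cls_logits: torch.Tensor,
                           dist_logits: torch.Tensor,
                           teacher_logits: torch.Tensor,
                           target: torch.Tensor) -> torch.Tensor:
    """Illustrative DeiT-style hard distillation loss."""
    teacher_pred = teacher_logits.argmax(dim=1)             # hard teacher labels
    loss_cls = F.cross_entropy(cls_logits, target)          # class-token branch
    loss_dist = F.cross_entropy(dist_logits, teacher_pred)  # distillation-token branch
    return 0.5 * (loss_cls + loss_dist)


# Shape check with random data: 8 samples, 1000 classes.
cls = torch.randn(8, 1000)
dist = torch.randn(8, 1000)
teacher = torch.randn(8, 1000)
y = torch.randint(0, 1000, (8,))
print(hard_distillation_loss(cls, dist, teacher, y))
```

At inference time the predictions from the class token and the distillation token are averaged, which is how the `DeiTClsHead` used in the configs below behaves.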
## Results and models
### ImageNet-1k
The teacher of the distilled DeiT models is RegNetY-16GF.
| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------: | :--------------------------------------------------------------: |
| DeiT-tiny | From scratch | 5.72 | 1.08 | 74.50 | 92.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-tiny_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) |
| DeiT-tiny distilled\* | From scratch | 5.72 | 1.08 | 74.51 | 91.90 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) |
| DeiT-small | From scratch | 22.05 | 4.24 | 80.69 | 95.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-small_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) |
| DeiT-small distilled\* | From scratch | 22.05 | 4.24 | 81.17 | 95.40 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-small-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) |
| DeiT-base | From scratch | 86.57 | 16.86 | 81.76 | 95.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) |
| DeiT-base\* | From scratch | 86.57 | 16.86 | 81.79 | 95.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth) |
| DeiT-base distilled\* | From scratch | 86.57 | 16.86 | 83.33 | 96.49 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base-distilled_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) |
| DeiT-base 384px\* | ImageNet-1k | 86.86 | 49.37 | 83.04 | 96.31 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) |
| DeiT-base distilled 384px\* | ImageNet-1k | 86.86 | 49.37 | 85.55 | 97.35 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/deit). The config files of these models are only for validation. We haven't verified the training accuracy of these configs and welcome you to contribute your reproduction results.*
```{warning}
MMClassification doesn't support training the distilled versions of DeiT,
so the distilled checkpoints are provided for inference only.
```
## Citation
```bibtex
@InProceedings{pmlr-v139-touvron21a,
title = {Training data-efficient image transformers \& distillation through attention},
author = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jegou, Herve},
booktitle = {International Conference on Machine Learning},
pages = {10347--10357},
year = {2021},
volume = {139},
month = {July}
}
```
_base_ = './deit-base_ft-16xb32_in1k-384px.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer'),
head=dict(type='DeiTClsHead'),
# Change to the path of the pretrained model
# init_cfg=dict(type='Pretrained', checkpoint=''),
)
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-base'),
head=dict(type='DeiTClsHead', in_channels=768),
)
# data settings
data = dict(samples_per_gpu=64, workers_per_gpu=5)
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs4096_AdamW.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='deit-base',
img_size=384,
patch_size=16,
),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
# Change to the path of the pretrained model
# init_cfg=dict(type='Pretrained', checkpoint=''),
)
# data settings
data = dict(samples_per_gpu=32, workers_per_gpu=5)
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(
type='VisionTransformer', arch='deit-base', drop_path_rate=0.1),
head=dict(type='VisionTransformerClsHead', in_channels=768),
)
# data settings
data = dict(samples_per_gpu=64, workers_per_gpu=5)
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-small'),
head=dict(type='DeiTClsHead', in_channels=384),
)
# Compared with the original config, drop path and the EMA hook are
# removed for the small and tiny archs.
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='deit-small',
img_size=224,
patch_size=16),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=384,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
]))
# data settings
data = dict(samples_per_gpu=256, workers_per_gpu=5)
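# Exclude normalization layers, biases, the class token and the position
# embedding from weight decay.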
paramwise_cfg = dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
})
optimizer = dict(paramwise_cfg=paramwise_cfg)
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-tiny'),
head=dict(type='DeiTClsHead', in_channels=192),
)
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='VisionTransformer', arch='deit-tiny'),
head=dict(type='VisionTransformerClsHead', in_channels=192),
)