Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
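# Note: the 'headless' EVA02 configs below define only the pre-trained backbone
# (neck=None, head=None), matching the *_headless.py entries in the accompanying
# metafile; they are intended for feature extraction or as a starting point for
# fine-tuning rather than for direct evaluation.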
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='b',
img_size=224,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
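# For reference: the mean/std above are the CLIP normalization constants scaled
# to the 0-255 pixel range (e.g. 0.48145466 * 255 = 122.770938...); the same
# values appear precomputed in the Flamingo configs' data_preprocessor settings.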
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_448.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='b',
img_size=448,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='l',
img_size=224,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_448.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='l',
img_size=448,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='s',
img_size=224,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='s',
img_size=336,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=384,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='t',
img_size=224,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='t',
img_size=336,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=192,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
Collections:
- Name: EVA02
Metadata:
Architecture:
- Rotary Position Embedding
- Sub Layer Normalization
- SwiGLU
Paper:
Title: 'EVA-02: A Visual Representation for Neon Genesis'
URL: https://arxiv.org/abs/2303.11331
README: configs/eva02/README.md
Models:
- Name: vit-tiny-p14_eva02-pre_in21k
Metadata:
FLOPs: 1703439360
Parameters: 5504064
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-tiny-p14_pre_in21k_20230505-d703e7b1.pth
Config: configs/eva02/eva02-tiny-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_Ti_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-tiny-p14_eva02-in21k-pre_3rdparty_in1k-336px
- Name: vit-tiny-p14_eva02-in21k-pre_3rdparty_in1k-336px
Metadata:
FLOPs: 4675416000
Parameters: 5758888
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 80.69
Top 5 Accuracy: 95.54
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-tiny-p14_in21k-pre_3rdparty_in1k-336px_20230505-a4e8708a.pth
Config: configs/eva02/eva02-tiny-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_Ti_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-small-p14_eva02-pre_in21k
Metadata:
FLOPs: 6135404544
Parameters: 21624960
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-small-p14_pre_in21k_20230505-3175f463.pth
Config: configs/eva02/eva02-small-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_S_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-small-p14_eva02-in21k-pre_3rdparty_in1k-336px
- Name: vit-small-p14_eva02-in21k-pre_3rdparty_in1k-336px
Metadata:
FLOPs: 15476744064
Parameters: 22133608
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.78
Top 5 Accuracy: 97.60
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-small-p14_in21k-pre_3rdparty_in1k-336px_20230505-9c5b0e85.pth
Config: configs/eva02/eva02-small-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_S_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-base-p14_eva02-pre_in21k
Metadata:
FLOPs: 23216492544
Parameters: 85766400
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_pre_in21k_20230505-2f2d4d3c.pth
Config: configs/eva02/eva02-base-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_B_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-base-p14_eva02-in21k-pre_3rdparty_in1k-448px
- vit-base-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-base-p14_eva02-in21k-pre_3rdparty_in1k-448px
Metadata:
FLOPs: 107105984256
Parameters: 87126760
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 88.29
Top 5 Accuracy: 98.53
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_in21k-pre_3rdparty_in1k-448px_20230505-8ad211c5.pth
Config: configs/eva02/eva02-base-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_B_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-base-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 107105984256
Parameters: 87126760
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 88.47
Top 5 Accuracy: 98.62
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_in21k-pre_in21k-medft_3rdparty_in1k-448px_20230505-5cd4d87f.pth
Config: configs/eva02/eva02-base-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_B_pt_in21k_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-large-p14_eva02-pre_in21k
Metadata:
FLOPs: 81146703792
Parameters: 303291328
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_pre_in21k_20230505-9072de5d.pth
Config: configs/eva02/eva02-large-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_L_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-large-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-large-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 362333836208
Parameters: 305104808
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 89.65
Top 5 Accuracy: 98.95
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_in21k-pre_in21k-medft_3rdparty_in1k-448px_20230505-926d1599.pth
Config: configs/eva02/eva02-large-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_L_pt_in21k_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-large-p14_eva02-pre_m38m
Metadata:
FLOPs: 81146703792
Parameters: 303291328
Training Data:
- Merged-38M
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_pre_m38m_20230505-b8a1a261.pth
Config: configs/eva02/eva02-large-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_L_pt_m38m_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-large-p14_eva02_m38m-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-large-p14_eva02_m38m-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 362333836208
Parameters: 305104808
Training Data:
- Merged-38M
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 89.83
Top 5 Accuracy: 99.00
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_m38m-pre_in21k-medft_3rdparty_in1k-448px_20230505-150dc5ed.pth
Config: configs/eva02/eva02-large-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_L_pt_m38m_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
# Flamingo
> [Flamingo: a Visual Language Model for Few-Shot Learning](https://arxiv.org/abs/2204.14198)
<!-- [ALGORITHM] -->
## Abstract
Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/236371424-3b9d2e16-3966-4c64-8b87-e33fd6348824.png" width="80%"/>
</div>
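The interleaved few-shot interface described above surfaces in the configs as plain prompt templates. Below is a minimal sketch of how a few-shot caption prompt could be assembled from the `shot_prompt_tmpl` and `final_prompt_tmpl` values used in the few-shot caption config (illustrative only, not the library's internal implementation; the example captions are the ones used in the zero-shot caption prompt):

```python
# Illustrative sketch: assembling an interleaved few-shot caption prompt from the
# template strings defined in the Flamingo configs.
shot_prompt_tmpl = '<image>Output:{caption}<|endofchunk|>'
final_prompt_tmpl = '<image>Output:'

# Example captions (taken from the zero-shot prompt in the caption config); in the
# actual pipeline these are presumably filled from the sampled support set
# (num_shots in the dataset config).
shots = [
    'A child holding a flowered umbrella and petting a yak.',
    'The child is holding a brush close to his mouth.',
]

prompt = ''.join(shot_prompt_tmpl.format(caption=c) for c in shots) + final_prompt_tmpl
print(prompt)
# -> '<image>Output:A child holding ...<|endofchunk|><image>Output:The child ...<|endofchunk|><image>Output:'
```

Each `<image>` token marks where a vision-encoded image is attended to by the frozen language model through the gated cross-attention layers.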
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('flamingo_3rdparty-zeroshot_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'A dog and a cat are looking at each other. '}
```
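The zero-shot VQA variant from the table below can be called through the same entry point. A hedged sketch (assuming the VQA inferencer accepts the question as the argument following the image path):

```python
from mmpretrain import inference_model

# Sketch: model name taken from the metafile in this folder; the question string
# is assumed to be accepted right after the image path.
result = inference_model('flamingo_3rdparty-zeroshot_vqa',
                         'demo/cat-dog.png',
                         'What animals are in the picture?')
print(result)
```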
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/flamingo/flamingo_zeroshot_caption.py https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (G) | CIDER | Config | Download |
| :------------------------------------- | :--------: | :---: | :------------------------------------: | :-----------------------------------------------------------------------------------------------------------: |
| `flamingo_3rdparty-zeroshot_caption`\* | 8.220 | 65.50 | [config](flamingo_zeroshot_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth) |
*Models with * are converted from the [openflamingo](https://github.com/mlfoundations/open_flamingo) repository. The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (G) | Accuracy | Config | Download |
| :--------------------------------- | :--------: | :------: | :--------------------------------: | :----------------------------------------------------------------------------------------------------------------: |
| `flamingo_3rdparty-zeroshot_vqa`\* | 8.22 | 43.50 | [config](flamingo_zeroshot_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth) |
*Models with * are converted from the [openflamingo](https://github.com/mlfoundations/open_flamingo) repository. The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{Alayrac2022FlamingoAV,
title={Flamingo: a Visual Language Model for Few-Shot Learning},
author={Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katie Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andy Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan},
journal={ArXiv},
year={2022},
volume={abs/2204.14198}
}
```
```bibtex
@software{anas_awadalla_2023_7733589,
author = {Awadalla, Anas and Gao, Irena and Gardner, Joshua and Hessel, Jack and Hanafy, Yusuf and Zhu, Wanrong and Marathe, Kalyani and Bitton, Yonatan and Gadre, Samir and Jitsev, Jenia and Kornblith, Simon and Koh, Pang Wei and Ilharco, Gabriel and Wortsman, Mitchell and Schmidt, Ludwig},
title = {OpenFlamingo},
month = mar,
year = 2023,
publisher = {Zenodo},
version = {v0.1.1},
doi = {10.5281/zenodo.7733589},
url = {https://doi.org/10.5281/zenodo.7733589}
}
```
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='caption',
shot_prompt_tmpl='<image>Output:{caption}<|endofchunk|>',
final_prompt_tmpl='<image>Output:',
generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(
type='ApplyToList',
        # Flamingo needs to load multiple images during few-shot inference.
scatter_key='img_path',
transforms=[
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
],
collate_keys=['img', 'scale_factor', 'ori_shape'],
),
dict(
type='PackInputs',
algorithm_keys=['gt_caption', 'shots'],
meta_keys=['image_id']),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOCaption',
data_root='data/coco',
ann_file='annotations/captions_train2014.json',
data_prefix=dict(img_path='train2014'),
pipeline=test_pipeline,
num_shots=2,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/captions_train2014.json')
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='vqa',
shot_prompt_tmpl=
'<image>Question:{question} Short Answer:{answer}<|endofchunk|>',
final_prompt_tmpl='<image>Question:{question} Short Answer:',
generation_cfg=dict(num_beams=3, max_new_tokens=5, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(
type='ApplyToList',
        # Flamingo needs to load multiple images during few-shot inference.
scatter_key='img_path',
transforms=[
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
],
collate_keys=['img', 'scale_factor', 'ori_shape'],
),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id']),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=2,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
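# Note: the VQAv2 test-dev split has no public answers, so the evaluator below
# only dumps predictions to a JSON file (for submission to the evaluation server)
# instead of computing accuracy directly.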
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
zeroshot_prompt = (
'Output:A child holding a flowered umbrella and petting a yak.<|endofchunk|>' # noqa: E501
'Output:The child is holding a brush close to his mouth.<|endofchunk|>' # noqa: E501
)
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='caption',
zeroshot_prompt=zeroshot_prompt,
final_prompt_tmpl='<image>Output:',
generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['gt_caption'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOCaption',
data_root='data/coco',
ann_file='annotations/captions_train2014.json',
data_prefix=dict(img_path='train2014'),
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/captions_train2014.json')
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
zeroshot_prompt = (
'Question:What is this photo taken looking through? Short Answer:pitcher<|endofchunk|>' # noqa: E501
'Question:How many people are wearing shorts in the forefront of this photo? Short Answer:4<|endofchunk|>' # noqa: E501
)
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='vqa',
zeroshot_prompt=zeroshot_prompt,
final_prompt_tmpl='<image>Question:{question} Short Answer:',
generation_cfg=dict(num_beams=3, max_new_tokens=5, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
Collections:
- Name: Flamingo
Metadata:
Architecture:
- Transformer
- Gated Cross-Attention Dense
Paper:
Title: 'Flamingo: a Visual Language Model for Few-Shot Learning'
URL: https://arxiv.org/abs/2204.14198
README: configs/flamingo/README.md
Models:
- Name: flamingo_3rdparty-zeroshot_caption
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Flamingo
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
CIDER: 65.50 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
Config: configs/flamingo/flamingo_zeroshot_caption.py
Converted From:
Weights: https://huggingface.co/openflamingo/OpenFlamingo-9B
Code: https://github.com/mlfoundations/open_flamingo
- Name: flamingo_3rdparty-zeroshot_vqa
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Flamingo
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 43.50 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
Config: configs/flamingo/flamingo_zeroshot_vqa.py
Converted From:
Weights: https://huggingface.co/openflamingo/OpenFlamingo-9B
Code: https://github.com/mlfoundations/open_flamingo
# GLIP
> [Grounded Language-Image Pre-training](https://arxiv.org/abs/2112.03857)
<!-- [ALGORITHM] -->
## Abstract
This paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data to improve both tasks and bootstrap a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich. In our experiments, we pre-train GLIP on 27M grounding data, including 3M human-annotated and 24M web-crawled image-text pairs. The learned representations demonstrate strong zero-shot and few-shot transferability to various object-level recognition tasks. 1) When directly evaluated on COCO and LVIS (without seeing any images in COCO during pre-training), GLIP achieves 49.8 AP and 26.9 AP, respectively, surpassing many supervised baselines. 2) After fine-tuned on COCO, GLIP achieves 60.8 AP on val and 61.5 AP on test-dev, surpassing prior SoTA. 3) When transferred to 13 downstream object detection tasks, a 1-shot GLIP rivals with a fully-supervised Dynamic Head.
<div align="center">
<img src="https://github.com/microsoft/GLIP/blob/main/docs/lead.png" width="70%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('swin-t_glip-pre_3rdparty', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
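The 384px GLIP-L backbone from the table below can be loaded the same way; a sketch, feeding an input at the 384x384 resolution it was pre-trained with:

```python
import torch
from mmpretrain import get_model

# Sketch: model name and resolution taken from the table/metafile below.
model = get_model('swin-l_glip-pre_3rdparty_384px', pretrained=True)
feats = model.extract_feat(torch.rand(1, 3, 384, 384))
print(type(feats))
```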
<!-- [TABS-END] -->
## Results and models
### Pre-trained models
The pre-trained models are only used for fine-tuning, and therefore don't have evaluation results.
| Model | Pretrain | resolution | Download |
| :------------------------------------------ | :------------------------: | :--------: | :-------------------------------------------------------------------------------------------------------------------: |
| GLIP-T (`swin-t_glip-pre_3rdparty`)\* | O365,GoldG,CC3M,SBU | 224x224 | [model](https://download.openmmlab.com/mmclassification/v1/glip/swin-t_glip-pre_3rdparty_20230413-d85813b5.pth) |
| GLIP-L (`swin-l_glip-pre_3rdparty_384px`)\* | FourODs,GoldG,CC3M+12M,SBU | 384x384 | [model](https://download.openmmlab.com/mmclassification/v1/glip/swin-l_glip-pre_3rdparty_384px_20230413-04b198e8.pth) |
*Models with * are converted from the [official repo](https://github.com/microsoft/GLIP).*
## Citation
```bibtex
@inproceedings{li2021grounded,
title={Grounded Language-Image Pre-training},
author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
year={2022},
booktitle={CVPR},
}
```
model = dict(
type='ImageClassifier',
backbone=dict(
type='SwinTransformer',
arch='large',
img_size=384,
out_indices=(1, 2, 3), # original weight is for detection
stage_cfgs=dict(block_cfgs=dict(window_size=12))),
neck=None,
head=None)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[103.53, 116.28, 123.675],
std=[57.375, 57.12, 58.395],
    # keep the original BGR channel order (no BGR-to-RGB conversion)
to_rgb=False,
)
model = dict(
type='ImageClassifier',
backbone=dict(
type='SwinTransformer',
arch='tiny',
img_size=224,
out_indices=(1, 2, 3), # original weight is for detection
),
neck=None,
head=None)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[103.53, 116.28, 123.675],
std=[57.375, 57.12, 58.395],
    # keep the original BGR channel order (no BGR-to-RGB conversion)
to_rgb=False,
)
Collections:
- Name: GLIP
Metadata:
Training Techniques:
- AdamW
- Weight Decay
Architecture:
- Shift Window Multihead Self Attention
Paper:
URL: https://arxiv.org/abs/2112.03857
Title: "Grounded Language-Image Pre-training"
README: configs/glip/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/vit.py
Version: v1.0.0rc8
Models:
- Name: swin-t_glip-pre_3rdparty
In Collection: GLIP
Metadata:
FLOPs: 4508464128
Parameters: 29056354
Training Data:
- O365
- GoldG
- CC3M
- SBU
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/glip/swin-t_glip-pre_3rdparty_20230413-d85813b5.pth
Converted From:
Weights: https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg_cc_sbu.pth
Code: https://github.com/microsoft/GLIP
Config: configs/glip/glip-t_headless.py
- Name: swin-l_glip-pre_3rdparty_384px
In Collection: GLIP
Metadata:
FLOPs: 104080343040
Parameters: 196735516
Training Data:
- FourODs
- GoldG
- CC3M+12M
- SBU
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/glip/swin-l_glip-pre_3rdparty_384px_20230413-04b198e8.pth
Converted From:
Weights: https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth
Code: https://github.com/microsoft/GLIP
Config: configs/glip/glip-l_headless.py
# HiViT
> [HiViT: A Simple and More Efficient Design of Hierarchical Vision Transformer](https://arxiv.org/abs/2205.14949)
<!-- [ALGORITHM] -->
## Abstract
Recently, masked image modeling (MIM) has offered a new methodology of self-supervised pre-training of vision transformers. A key idea of efficient implementation is to discard the masked image patches (or tokens) throughout the target network (encoder), which requires the encoder to be a plain vision transformer (e.g., ViT), albeit hierarchical vision transformers (e.g., Swin Transformer) have potentially better properties in formulating vision inputs. In this paper, we offer a new design of hierarchical vision transformers named HiViT (short for Hierarchical ViT) that enjoys both high efficiency and good performance in MIM. The key is to remove the unnecessary "local inter-unit operations", deriving structurally simple hierarchical vision transformers in which mask-units can be serialized like plain vision transformers. For this purpose, we start with Swin Transformer and (i) set the masking unit size to be the token size in the main stage of Swin Transformer, (ii) switch off inter-unit self-attentions before the main stage, and (iii) eliminate all operations after the main stage. Empirical studies demonstrate the advantageous performance of HiViT in terms of fully-supervised, self-supervised, and transfer learning. In particular, in running MAE on ImageNet-1K, HiViT-B reports a +0.6% accuracy gain over ViT-B and a 1.9$\times$ speed-up over Swin-B, and the performance gain generalizes to downstream tasks of detection and segmentation. Code will be made publicly available.
<div align=center>
<img src="https://github.com/open-mmlab/mmpretrain/assets/36138628/4a99cf9d-15df-4866-8750-bd2c3db5d894" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
<!-- **Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('hivit-tiny-p16_16xb64_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
``` -->
<!-- **Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('hivit-tiny-p16_16xb64_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
``` -->
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/hivit/hivit-tiny-p16_16xb64_in1k.py
```
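The `16xb64` part of the config name encodes the intended schedule (16 GPUs with a batch size of 64 per GPU). To reproduce it with the standard OpenMMLab distributed launcher (a sketch; adjust the GPU count to your setup):

```shell
bash tools/dist_train.sh configs/hivit/hivit-tiny-p16_16xb64_in1k.py 16
```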
<!-- Test:
```shell
python tools/test.py configs/hivit/hivit-tiny-p16_16xb64_in1k.py None
``` -->
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------- | :----------: | :--------: | :-------: | :-------: | :--------------------------------------: | :------: |
| `hivit-tiny-p16_16xb64_in1k` | From scratch | 19.18 | 4.60 | 82.10 | [config](hivit-tiny-p16_16xb64_in1k.py) | N/A |
| `hivit-small-p16_16xb64_in1k` | From scratch | 37.53 | 9.07 | N/A | [config](hivit-small-p16_16xb64_in1k.py) | N/A |
| `hivit-base-p16_16xb64_in1k` | From scratch | 79.05 | 18.47 | N/A | [config](hivit-base-p16_16xb64_in1k.py) | N/A |
## Citation
```bibtex
@inproceedings{zhanghivit,
title={HiViT: A Simpler and More Efficient Design of Hierarchical Vision Transformer},
author={Zhang, Xiaosong and Tian, Yunjie and Xie, Lingxi and Huang, Wei and Dai, Qi and Ye, Qixiang and Tian, Qi},
booktitle={International Conference on Learning Representations},
year={2023},
}
```