# File: clip_vit-large-p14_zeroshot-cls_cifar100.py (1.65 KB)
# Last committed by: renzhc
# (Repository-viewer line-number gutter, 1-68, removed.)
# Relative path to the base runtime config this file builds on; presumably
# merged in by the config loader before the settings below — confirm against
# the loader's inheritance rules.
_base_ = '../_base_/default_runtime.py'

# data settings
# Per-channel normalisation statistics are written on the 0-1 scale and
# multiplied by 255, so the preprocessor receives them on the 0-255 scale.
# NOTE(review): `to_rgb=False` presumably means no channel reordering is
# applied to the inputs — confirm against the preprocessor implementation.
data_preprocessor = {
    'type': 'MultiModalDataPreprocessor',
    'mean': [c * 255 for c in (0.48145466, 0.4578275, 0.40821073)],
    'std': [c * 255 for c in (0.26862954, 0.26130258, 0.27577711)],
    'to_rgb': False,
}

# Evaluation-time transform configs, listed in order: a bicubic resize to
# 224x224, followed by a packing step that names the extra algorithm and
# meta keys to keep.
test_pipeline = [
    {
        'type': 'Resize',
        'scale': (224, 224),
        'interpolation': 'bicubic',
    },
    {
        'type': 'PackInputs',
        'algorithm_keys': ['text'],
        'meta_keys': ['image_id', 'scale_factor'],
    },
]

# No training loader is defined in this config.
train_dataloader = None

# Loader for the CIFAR-100 'test' split, read from data/cifar100 without
# shuffling. `pipeline` reuses the module-level `test_pipeline` list object.
test_dataloader = {
    'batch_size': 32,
    'num_workers': 8,
    'dataset': {
        'type': 'CIFAR100',
        'data_root': 'data/cifar100',
        'split': 'test',
        'pipeline': test_pipeline,
    },
    'sampler': {'type': 'DefaultSampler', 'shuffle': False},
}

# Accuracy metric configured with topk=(1, 5).
test_evaluator = {'type': 'Accuracy', 'topk': (1, 5)}

# schedule settings
# Train and validation loop configs are left unset (None); the test loop
# config is an empty dict.
train_cfg = val_cfg = None
test_cfg = {}

# model settings
# CLIP zero-shot classifier config. The projection output, the text
# transformer width, `transformer_width` and `proj_dim` are all 768.
model = {
    'type': 'CLIPZeroShot',
    # Image encoder: 'large' ViT, 224px input, 14px patches, QuickGELU
    # activation in its layers, pre-norm enabled.
    'vision_backbone': {
        'type': 'VisionTransformer',
        'arch': 'large',
        'img_size': 224,
        'patch_size': 14,
        'drop_rate': 0.0,
        'layer_cfgs': {
            'act_cfg': {'type': 'QuickGELU'},
        },
        'pre_norm': True,
    },
    # Projection from 1024 input channels to 768 output channels.
    'projection': {
        'type': 'CLIPProjection',
        'in_channels': 1024,
        'out_channels': 768,
    },
    # Text encoder: 12 layers, 12 heads, width 768, attention mask enabled.
    'text_backbone': {
        'type': 'CLIPTransformer',
        'width': 768,
        'layers': 12,
        'heads': 12,
        'attn_mask': True,
    },
    # Tokenizer identified by name/path, with the fast variant turned off.
    'tokenizer': {
        'type': 'AutoTokenizer',
        'name_or_path': 'openai/clip-vit-large-patch14',
        'use_fast': False,
    },
    'vocab_size': 49408,
    'transformer_width': 768,
    'proj_dim': 768,
    # Names selecting the class-name set and the prompt template set;
    # NOTE(review): presumably resolved by the model class — confirm.
    'text_prototype': 'cifar100',
    'text_prompt': 'openai_cifar100',
    'context_length': 77,
}