first commit

dff2c686 · renzhc · 8f9dd0ed · dff2c686 · dff2c686 · dff2c686
Commit dff2c686 authored Sep 03, 2024 by renzhc
20 changed files
--- a/configs/_base_/datasets/imagenet_bs64_swin_384.py
+++ b/configs/_base_/datasets/imagenet_bs64_swin_384.py
+# dataset settings
+dataset_type = 'ImageNet'
+data_preprocessor = dict(
+    num_classes=1000,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=384,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=384, backend='pillow', interpolation='bicubic'),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/imagenet_bs64_t2t_224.py
+++ b/configs/_base_/datasets/imagenet_bs64_t2t_224.py
+# dataset settings
+dataset_type = 'ImageNet'
+data_preprocessor = dict(
+    num_classes=1000,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+bgr_mean = data_preprocessor['mean'][::-1]
+bgr_std = data_preprocessor['std'][::-1]
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(
+        type='RandAugment',
+        policies='timm_increasing',
+        num_policies=2,
+        total_level=10,
+        magnitude_level=9,
+        magnitude_std=0.5,
+        hparams=dict(
+            pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
+    dict(
+        type='RandomErasing',
+        erase_prob=0.25,
+        mode='rand',
+        min_area_ratio=0.02,
+        max_area_ratio=1 / 3,
+        fill_color=bgr_mean,
+        fill_std=bgr_std),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=248,
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/imagenet_bs8_pil_bicubic_320.py
+++ b/configs/_base_/datasets/imagenet_bs8_pil_bicubic_320.py
+# dataset settings
+dataset_type = 'ImageNet'
+data_preprocessor = dict(
+    # RGB format normalization parameters
+    mean=[122.5, 122.5, 122.5],
+    std=[122.5, 122.5, 122.5],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=320,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=int(320 / 224 * 256),
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=320),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=8,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        split='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/inshop_bs32_448.py
+++ b/configs/_base_/datasets/inshop_bs32_448.py
+# dataset settings
+dataset_type = 'InShop'
+data_preprocessor = dict(
+    num_classes=3997,
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=512),
+    dict(type='RandomCrop', crop_size=448),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=512),
+    dict(type='CenterCrop', crop_size=448),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/inshop',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+query_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/inshop',
+        split='query',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+gallery_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/inshop',
+        split='gallery',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_dataloader = query_dataloader
+val_evaluator = [
+    dict(type='RetrievalRecall', topk=1),
+    dict(type='RetrievalAveragePrecision', topk=10),
+]
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/nlvr2.py
+++ b/configs/_base_/datasets/nlvr2.py
+# dataset settings
+data_preprocessor = dict(
+    type='MultiModalDataPreprocessor',
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(
+        type='ApplyToList',
+        # NLVR requires to load two images in task.
+        scatter_key='img_path',
+        transforms=[
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='RandomResizedCrop',
+                scale=384,
+                interpolation='bicubic',
+                backend='pillow'),
+            dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+        ],
+        collate_keys=['img', 'scale_factor', 'ori_shape'],
+    ),
+    dict(type='CleanCaption', keys='text'),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['text'],
+        meta_keys=['image_id'],
+    ),
+]
+test_pipeline = [
+    dict(
+        type='ApplyToList',
+        # NLVR requires to load two images in task.
+        scatter_key='img_path',
+        transforms=[
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='Resize',
+                scale=(384, 384),
+                interpolation='bicubic',
+                backend='pillow'),
+        ],
+        collate_keys=['img', 'scale_factor', 'ori_shape'],
+    ),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['text'],
+        meta_keys=['image_id'],
+    ),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='NLVR2',
+        data_root='data/nlvr2',
+        ann_file='dev.json',
+        data_prefix='dev',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True,
+    drop_last=True,
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=8,
+    dataset=dict(
+        type='NLVR2',
+        data_root='data/nlvr2',
+        ann_file='dev.json',
+        data_prefix='dev',
+        pipeline=test_pipeline,
+    ),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='Accuracy')
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/nocaps.py
+++ b/configs/_base_/datasets/nocaps.py
+# data settings
+data_preprocessor = dict(
+    type='MultiModalDataPreprocessor',
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        scale=(384, 384),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(type='PackInputs', meta_keys=['image_id']),
+]
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=5,
+    dataset=dict(
+        type='NoCaps',
+        data_root='data/nocaps/',
+        data_prefix=dict(img_path='images/'),
+        ann_file='annotations/nocaps_val_4500_captions.json',
+        pipeline=test_pipeline,
+    ),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(
+    type='NocapsSave',
+    save_dir='./',
+)
+# # If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/ocrvqa.py
+++ b/configs/_base_/datasets/ocrvqa.py
+# data settings
+data_preprocessor = dict(
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=384,
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(type='CleanCaption', keys=['question', 'gt_answer']),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=[],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        scale=(480, 480),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(type='CleanCaption', keys=['question', 'gt_answer']),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=[],
+    ),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='OCRVQA',
+        data_root='data/ocrvqa',
+        data_prefix='images',
+        ann_file='annotations/dataset.json',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True,
+    drop_last=True,
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=8,
+    dataset=dict(
+        type='OCRVQA',
+        data_root='data/ocrvqa',
+        data_prefix='images',
+        ann_file='annotations/dataset.json',
+        split='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='VQAAcc')
+test_dataloader = dict(
+    batch_size=64,
+    num_workers=8,
+    dataset=dict(
+        type='OCRVQA',
+        data_root='data/ocrvqa',
+        data_prefix='images',
+        ann_file='annotations/dataset.json',
+        split='test',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+test_evaluator = dict(type='VQAAcc')
--- a/configs/_base_/datasets/pipelines/auto_aug.py
+++ b/configs/_base_/datasets/pipelines/auto_aug.py
+# Policy for ImageNet, refers to
+# https://github.com/DeepVoltaire/AutoAugment/blame/master/autoaugment.py
+policy_imagenet = [
+    [
+        dict(type='Posterize', bits=4, prob=0.4),
+        dict(type='Rotate', angle=30., prob=0.6)
+    ],
+    [
+        dict(type='Solarize', thr=256 / 9 * 4, prob=0.6),
+        dict(type='AutoContrast', prob=0.6)
+    ],
+    [dict(type='Equalize', prob=0.8),
+     dict(type='Equalize', prob=0.6)],
+    [
+        dict(type='Posterize', bits=5, prob=0.6),
+        dict(type='Posterize', bits=5, prob=0.6)
+    ],
+    [
+        dict(type='Equalize', prob=0.4),
+        dict(type='Solarize', thr=256 / 9 * 5, prob=0.2)
+    ],
+    [
+        dict(type='Equalize', prob=0.4),
+        dict(type='Rotate', angle=30 / 9 * 8, prob=0.8)
+    ],
+    [
+        dict(type='Solarize', thr=256 / 9 * 6, prob=0.6),
+        dict(type='Equalize', prob=0.6)
+    ],
+    [dict(type='Posterize', bits=6, prob=0.8),
+     dict(type='Equalize', prob=1.)],
+    [
+        dict(type='Rotate', angle=10., prob=0.2),
+        dict(type='Solarize', thr=256 / 9, prob=0.6)
+    ],
+    [
+        dict(type='Equalize', prob=0.6),
+        dict(type='Posterize', bits=5, prob=0.4)
+    ],
+    [
+        dict(type='Rotate', angle=30 / 9 * 8, prob=0.8),
+        dict(type='ColorTransform', magnitude=0., prob=0.4)
+    ],
+    [
+        dict(type='Rotate', angle=30., prob=0.4),
+        dict(type='Equalize', prob=0.6)
+    ],
+    [dict(type='Equalize', prob=0.0),
+     dict(type='Equalize', prob=0.8)],
+    [dict(type='Invert', prob=0.6),
+     dict(type='Equalize', prob=1.)],
+    [
+        dict(type='ColorTransform', magnitude=0.4, prob=0.6),
+        dict(type='Contrast', magnitude=0.8, prob=1.)
+    ],
+    [
+        dict(type='Rotate', angle=30 / 9 * 8, prob=0.8),
+        dict(type='ColorTransform', magnitude=0.2, prob=1.)
+    ],
+    [
+        dict(type='ColorTransform', magnitude=0.8, prob=0.8),
+        dict(type='Solarize', thr=256 / 9 * 2, prob=0.8)
+    ],
+    [
+        dict(type='Sharpness', magnitude=0.7, prob=0.4),
+        dict(type='Invert', prob=0.6)
+    ],
+    [
+        dict(
+            type='Shear',
+            magnitude=0.3 / 9 * 5,
+            prob=0.6,
+            direction='horizontal'),
+        dict(type='Equalize', prob=1.)
+    ],
+    [
+        dict(type='ColorTransform', magnitude=0., prob=0.4),
+        dict(type='Equalize', prob=0.6)
+    ],
+    [
+        dict(type='Equalize', prob=0.4),
+        dict(type='Solarize', thr=256 / 9 * 5, prob=0.2)
+    ],
+    [
+        dict(type='Solarize', thr=256 / 9 * 4, prob=0.6),
+        dict(type='AutoContrast', prob=0.6)
+    ],
+    [dict(type='Invert', prob=0.6),
+     dict(type='Equalize', prob=1.)],
+    [
+        dict(type='ColorTransform', magnitude=0.4, prob=0.6),
+        dict(type='Contrast', magnitude=0.8, prob=1.)
+    ],
+    [dict(type='Equalize', prob=0.8),
+     dict(type='Equalize', prob=0.6)],
+]
--- a/configs/_base_/datasets/pipelines/rand_aug.py
+++ b/configs/_base_/datasets/pipelines/rand_aug.py
+# Refers to `_RAND_INCREASING_TRANSFORMS` in pytorch-image-models
+rand_increasing_policies = [
+    dict(type='AutoContrast'),
+    dict(type='Equalize'),
+    dict(type='Invert'),
+    dict(type='Rotate', magnitude_key='angle', magnitude_range=(0, 30)),
+    dict(type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)),
+    dict(type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)),
+    dict(
+        type='SolarizeAdd',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 110)),
+    dict(
+        type='ColorTransform',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.9)),
+    dict(type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
+    dict(
+        type='Brightness', magnitude_key='magnitude',
+        magnitude_range=(0, 0.9)),
+    dict(
+        type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
+    dict(
+        type='Shear',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.3),
+        direction='horizontal'),
+    dict(
+        type='Shear',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.3),
+        direction='vertical'),
+    dict(
+        type='Translate',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.45),
+        direction='horizontal'),
+    dict(
+        type='Translate',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.45),
+        direction='vertical')
+]
--- a/configs/_base_/datasets/refcoco.py
+++ b/configs/_base_/datasets/refcoco.py
+# data settings
+data_preprocessor = dict(
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomApply',
+        transforms=[
+            dict(
+                type='ColorJitter',
+                brightness=0.4,
+                contrast=0.4,
+                saturation=0.4,
+                hue=0.1,
+                backend='cv2')
+        ],
+        prob=0.5),
+    dict(
+        type='mmdet.RandomCrop',
+        crop_type='relative_range',
+        crop_size=(0.8, 0.8),
+        allow_negative_crop=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(384, 384), (360, 360), (344, 344), (312, 312), (300, 300),
+                (286, 286), (270, 270)],
+        keep_ratio=False),
+    dict(
+        type='RandomTranslatePad',
+        size=384,
+        aug_translate=True,
+    ),
+    dict(type='CleanCaption', keys='text'),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['text', 'gt_bboxes', 'scale_factor'],
+        meta_keys=['image_id'],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        scale=(384, 384),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(type='CleanCaption', keys='text'),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['text', 'gt_bboxes', 'scale_factor'],
+        meta_keys=['image_id'],
+    ),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='RefCOCO',
+        data_root='data/coco',
+        data_prefix='train2014',
+        ann_file='refcoco/instances.json',
+        split_file='refcoco/refs(unc).p',
+        split='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    drop_last=True,
+)
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='RefCOCO',
+        data_root='data/coco',
+        data_prefix='train2014',
+        ann_file='refcoco/instances.json',
+        split_file='refcoco/refs(unc).p',
+        split='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='VisualGroundingMetric')
+test_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='RefCOCO',
+        data_root='data/coco',
+        data_prefix='train2014',
+        ann_file='refcoco/instances.json',
+        split_file='refcoco/refs(unc).p',
+        split='testA',  # or 'testB'
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/tiny_imagenet_bs32.py
+++ b/configs/_base_/datasets/tiny_imagenet_bs32.py
+# dataset settings
+dataset_type = 'CustomDataset'
+data_preprocessor = dict(
+    num_classes=200,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=224),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=256, edge='short'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=32,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/tiny_imagenet_bs32_pil_resize.py
+++ b/configs/_base_/datasets/tiny_imagenet_bs32_pil_resize.py
+# dataset settings
+dataset_type = 'CustomDataset'
+data_preprocessor = dict(
+    num_classes=200,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=224, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=256, edge='short', backend='pillow'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=32,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/tiny_imagenet_bs64_pil_resize_autoaug.py
+++ b/configs/_base_/datasets/tiny_imagenet_bs64_pil_resize_autoaug.py
+# dataset settings
+dataset_type = 'CustomDataset'
+data_preprocessor = dict(
+    num_classes=200,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+bgr_mean = data_preprocessor['mean'][::-1]
+bgr_std = data_preprocessor['std'][::-1]
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(
+        type='AutoAugment',
+        policies='imagenet',
+        hparams=dict(
+            pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=256,
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/tiny_imagenet_bs64_swin_224.py
+++ b/configs/_base_/datasets/tiny_imagenet_bs64_swin_224.py
+# dataset settings
+dataset_type = 'CustomDataset'
+data_preprocessor = dict(
+    num_classes=200,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+bgr_mean = data_preprocessor['mean'][::-1]
+bgr_std = data_preprocessor['std'][::-1]
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(
+        type='RandAugment',
+        policies='timm_increasing',
+        num_policies=2,
+        total_level=10,
+        magnitude_level=9,
+        magnitude_std=0.5,
+        hparams=dict(
+            pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
+    dict(
+        type='RandomErasing',
+        erase_prob=0.25,
+        mode='rand',
+        min_area_ratio=0.02,
+        max_area_ratio=1 / 3,
+        fill_color=bgr_mean,
+        fill_std=bgr_std),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=256,
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),
+]
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/vizwiz.py
+++ b/configs/_base_/datasets/vizwiz.py
+# data settings
+data_preprocessor = dict(
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=384,
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        scale=(480, 480),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(
+        type='CleanCaption',
+        keys=['question'],
+    ),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    ),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VizWiz',
+        data_root='data/vizwiz/Images',
+        data_prefix='',
+        ann_file='Annotations/train.json',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True,
+    drop_last=True,
+)
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VizWiz',
+        data_root='data/vizwiz/Images',
+        data_prefix='',
+        ann_file='Annotations/val.json',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='VizWizAcc')
+test_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VizWiz',
+        data_root='data/vizwiz/Images',
+        data_prefix='',
+        ann_file='Annotations/test.json',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+test_evaluator = dict(type='ReportVQA', file_path='vqa_test.json')
--- a/configs/_base_/datasets/voc_bs16.py
+++ b/configs/_base_/datasets/voc_bs16.py
+# dataset settings
+dataset_type = 'VOC'
+data_preprocessor = dict(
+    num_classes=20,
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+    # generate onehot-format labels for multi-label classification.
+    to_onehot=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=224),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=256, edge='short'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(
+        type='PackInputs',
+        # `gt_label_difficult` is needed for VOC evaluation
+        meta_keys=('sample_idx', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction',
+                   'gt_label_difficult')),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/VOC2007',
+        split='trainval',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/VOC2007',
+        split='test',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+)
+test_dataloader = val_dataloader
+# calculate precision_recall_f1 and mAP
+val_evaluator = [
+    dict(type='VOCMultiLabelMetric'),
+    dict(type='VOCMultiLabelMetric', average='micro'),
+    dict(type='VOCAveragePrecision')
+]
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
--- a/configs/_base_/datasets/vsr.py
+++ b/configs/_base_/datasets/vsr.py
+# data settings
+data_preprocessor = dict(
+    mean=[122.770938, 116.7460125, 104.09373615],
+    std=[68.5005327, 66.6321579, 70.32316305],
+    to_rgb=True,
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=384,
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        scale=(480, 480),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(
+        type='CleanCaption',
+        keys=['question'],
+    ),
+    dict(
+        type='PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    ),
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VSR',
+        data_root='data/coco',
+        data_prefix='',
+        ann_file='annotations/train.json',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+    drop_last=True,
+)
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VSR',
+        data_root='data/coco',
+        data_prefix='',
+        ann_file='annotations/val.json',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='VSRAcc')
+test_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='VSR',
+        data_root='data/coco',
+        data_prefix='',
+        ann_file='annotations/test.json',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+test_evaluator = val_evaluator
--- a/configs/_base_/default_runtime.py
+++ b/configs/_base_/default_runtime.py
+# defaults to use registries in mmpretrain
+default_scope = 'mmpretrain'
+# configure default hooks
+default_hooks = dict(
+    # record the time of every iteration.
+    timer=dict(type='IterTimerHook'),
+    # print log every 100 iterations.
+    logger=dict(type='LoggerHook', interval=100),
+    # enable the parameter scheduler.
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    # save checkpoint per epoch.
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    # set sampler seed in distributed evrionment.
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    # validation results visualization, set True to enable it.
+    visualization=dict(type='VisualizationHook', enable=False),
+)
+# configure environment
+env_cfg = dict(
+    # whether to enable cudnn benchmark
+    cudnn_benchmark=False,
+    # set multi process parameters
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    # set distributed parameters
+    dist_cfg=dict(backend='nccl'),
+)
+# set visualizer
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(type='UniversalVisualizer', vis_backends=vis_backends)
+# set log level
+log_level = 'INFO'
+# load from which checkpoint
+load_from = None
+# whether to resume training from the loaded checkpoint
+resume = False
+# Defaults to use random seed and disable `deterministic`
+randomness = dict(seed=None, deterministic=False)
--- a/configs/_base_/models/conformer/base-p16.py
+++ b/configs/_base_/models/conformer/base-p16.py
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='Conformer', arch='base', drop_path_rate=0.1, init_cfg=None),
+    neck=None,
+    head=dict(
+        type='ConformerHead',
+        num_classes=1000,
+        in_channels=[1536, 576],
+        init_cfg=None,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0)
+    ]),
+)
--- a/configs/_base_/models/conformer/small-p16.py
+++ b/configs/_base_/models/conformer/small-p16.py
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='Conformer', arch='small', drop_path_rate=0.1, init_cfg=None),
+    neck=None,
+    head=dict(
+        type='ConformerHead',
+        num_classes=1000,
+        in_channels=[1024, 384],
+        init_cfg=None,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0)
+    ]),
+)