update codes

b12850fe · dengjb · 6515fb96 · b12850fe · b12850fe · b12850fe
Commit b12850fe authored May 29, 2024 by dengjb
20 changed files
--- a/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py
+++ b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py
+# Swin-B variant of the zero-shot "concat" DOD evaluation config.
+# Only the backbone and the neck input channels are overridden here; every
+# other setting is inherited from the Swin-T config named in `_base_`.
+_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'
+
+model = dict(
+    type='GroundingDINO',
+    # Backbone hyper-parameters that replace the inherited Swin-T values.
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    # NOTE(review): presumably the channel widths of the three backbone
+    # stages fed to the neck (embed_dims * 2 / 4 / 8) - confirm.
+    neck=dict(in_channels=[256, 512, 1024]),
+)
--- a/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py
+++ b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py
+# "Parallel" counterpart of the Swin-B concat DOD config: identical except for
+# `model.test_cfg.chunked_size`.
+_base_ = 'grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py'
+
+# NOTE(review): chunked_size=1 presumably makes the model score one text
+# description per forward pass instead of one concatenated prompt - confirm
+# against how GroundingDINO consumes `test_cfg.chunked_size`.
+model = dict(test_cfg=dict(chunked_size=1))
--- a/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
+++ b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
+# Zero-shot evaluation config for the Swin-T model on the data under
+# `data/d3/`. Three annotation files (full / pres / abs) are each wrapped in a
+# `DODDataset`, concatenated into a single dataloader, and scored with one
+# `DODCocoMetric` per annotation file.
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+data_root = 'data/d3/'
+
+# Test-time pipeline shared by the three datasets below. Images are decoded
+# and resized with the pillow backend.
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        # NOTE(review): 'sent_ids' is presumably produced by DODDataset and
+        # read back by DODCocoMetric - confirm.
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities', 'sent_ids'))
+]
+
+# -------------------------------------------------#
+# FULL split: dataset and its evaluator.
+val_dataset_full = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_full_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+
+# The evaluator is given the full path (data_root + file), whereas the dataset
+# above passes `data_root` and the relative `ann_file` separately.
+val_evaluator_full = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_full_annotations.json')
+
+# -------------------------------------------------#
+# PRES split: dataset and its evaluator.
+val_dataset_pres = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_pres_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_pres = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_pres_annotations.json')
+
+# -------------------------------------------------#
+# ABS split: dataset and its evaluator.
+val_dataset_abs = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_abs_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_abs = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_abs_annotations.json')
+
+# -------------------------------------------------#
+# The three lists below are index-aligned: datasets[i] is reported under
+# dataset_prefixes[i] and scored by metrics[i].
+datasets = [val_dataset_full, val_dataset_pres, val_dataset_abs]
+dataset_prefixes = ['FULL', 'PRES', 'ABS']
+metrics = [val_evaluator_full, val_evaluator_pres, val_evaluator_abs]
+
+# `_delete_=True` discards the inherited `dataset` dict instead of merging
+# these keys into it.
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+# `_delete_=True` likewise replaces the inherited evaluator wholesale.
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
+++ b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
+# "Parallel" counterpart of the Swin-T concat DOD config: identical except for
+# `model.test_cfg.chunked_size`.
+_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'
+
+# NOTE(review): chunked_size=1 presumably makes the model score one text
+# description per forward pass instead of one concatenated prompt - confirm
+# against how GroundingDINO consumes `test_cfg.chunked_size`.
+model = dict(test_cfg=dict(chunked_size=1))
--- a/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py
+++ b/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py
+# Zero-shot evaluation config for the Swin-T model on the Flickr30k val and
+# test annotation files. Both splits are concatenated into one dataloader and
+# each is scored by its own `Flickr30kMetric`.
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+dataset_type = 'Flickr30kDataset'
+data_root = 'data/flickr30k_entities/'
+
+# Test-time pipeline shared by both splits. Images are decoded and resized
+# with the pillow backend.
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        # NOTE(review): 'tokens_positive', 'phrase_ids' and 'phrases' are
+        # presumably produced by Flickr30kDataset and read by Flickr30kMetric
+        # - confirm.
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive', 'phrase_ids', 'phrases'))
+]
+
+# Validation split.
+dataset_Flickr30k_val = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_val.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+# Test split; same image folder, different annotation file.
+dataset_Flickr30k_test = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_test.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+val_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+test_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+# ----------Config---------- #
+# The three lists below are index-aligned: datasets[i] is reported under
+# dataset_prefixes[i] and scored by metrics[i].
+dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest']
+datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test]
+metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k]
+
+# `_delete_=True` discards the inherited `dataset` dict instead of merging
+# these keys into it.
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+# `_delete_=True` likewise replaces the inherited evaluator wholesale.
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
+++ b/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
+# Grounding DINO with a ResNet-50 backbone, 12-epoch schedule on COCO-format
+# data. This config is self-contained (it has no `_base_`).
+#
+# Values used in more than one place (paths, batch sizes, epoch count, dataset
+# type, language-model name, pipelines, visualizer backends) are declared once
+# and referenced by name, so editing a declaration changes every place that
+# uses it.
+
+# ---------------- paths ---------------- #
+work_dir = './work_dirs/grounding_dino_r50_scratch_8xb2_1x_coco'
+data_root = 'datasets/coco_mini/'
+train_anno = 'annotations/instances_train2017.json'
+val_anno = 'annotations/instances_val2017.json'
+train_image_dir = 'images/train2017/'
+val_image_dir = 'images/val2017/'
+
+# ---------------- shared settings ---------------- #
+test_batch_size = 1
+train_batch_size = 2
+max_epochs = 12
+backend_args = None
+dataset_type = 'CocoDataset'
+lang_model_name = 'bert-base-uncased'
+# Automatic LR scaling is disabled; 16 matches the "8xb2" in the file name.
+auto_scale_lr = dict(base_batch_size=16, enable=False)
+
+# ---------------- runtime ---------------- #
+default_scope = 'mmdet'
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='DetVisualizationHook'))
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'))
+launcher = 'pytorch'
+# No detector checkpoint is loaded; only the backbone starts from torchvision
+# weights (see `model.backbone.init_cfg`).
+load_from = None
+resume = False
+log_level = 'INFO'
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# ---------------- model ---------------- #
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=False),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        num_cp=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=0.0001)),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256, log_scale='auto', bias=True),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,
+        group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0),
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# ---------------- pipelines ---------------- #
+# Training augmentation: random flip, then one of two branches chosen at
+# random - (a) a multi-scale resize, or (b) resize -> random crop ->
+# multi-scale resize.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True),
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True),
+            ],
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities')),
+]
+
+# Evaluation pipeline, used by both the val and the test dataloader.
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities')),
+]
+
+# ---------------- dataloaders ---------------- #
+train_dataloader = dict(
+    batch_size=train_batch_size,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=train_anno,
+        data_prefix=dict(img=train_image_dir),
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline,
+        return_classes=True,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=test_batch_size,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=val_anno,
+        data_prefix=dict(img=val_image_dir),
+        test_mode=True,
+        pipeline=test_pipeline,
+        return_classes=True,
+        backend_args=backend_args))
+# Testing reuses the validation data and settings unchanged.
+test_dataloader = val_dataloader
+
+# ---------------- evaluation ---------------- #
+# The evaluator needs the full annotation path, hence data_root + val_anno.
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + val_anno,
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# ---------------- optimizer and schedule ---------------- #
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    # No weight decay on `absolute_pos_embed`; backbone parameters train at
+    # 0.1x the base learning rate.
+    paramwise_cfg=dict(
+        custom_keys=dict(
+            absolute_pos_embed=dict(decay_mult=0.0),
+            backbone=dict(lr_mult=0.1))))
+# LR is multiplied by 0.1 at epoch 11.
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1),
+]
+
+# ---------------- loops ---------------- #
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
--- a/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
+++ b/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
+# Swin-B variant of the COCO fine-tuning config. It inherits everything from
+# the Swin-T fine-tuning config and overrides the starting checkpoint, the
+# backbone hyper-parameters and the neck input channels.
+_base_ = [
+    './grounding_dino_swin-t_finetune_16xb2_1x_coco.py',
+]
+
+# Checkpoint the fine-tuning run starts from.
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'  # noqa
+model = dict(
+    type='GroundingDINO',
+    # Backbone hyper-parameters that replace the inherited Swin-T values.
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    # NOTE(review): presumably the channel widths of the three backbone
+    # stages fed to the neck (embed_dims * 2 / 4 / 8) - confirm.
+    neck=dict(in_channels=[256, 512, 1024]),
+)
--- a/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
+++ b/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
+# Swin-B variant of the Swin-T pretraining config named in `_base_`. Only the
+# backbone hyper-parameters and the neck input channels are overridden.
+_base_ = [
+    './grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py',
+]
+
+model = dict(
+    type='GroundingDINO',
+    # Backbone hyper-parameters that replace the inherited Swin-T values.
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    # NOTE(review): presumably the channel widths of the three backbone
+    # stages fed to the neck (embed_dims * 2 / 4 / 8) - confirm.
+    neck=dict(in_channels=[256, 512, 1024]),
+)
--- a/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
+++ b/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
+# Fine-tuning config for the Swin-T Grounding DINO model on COCO detection.
+# Dataset, schedule and runtime defaults come from the shared `_base_` files;
+# the model, pipelines, optimizer and LR schedule are defined in this file.
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# Checkpoint the fine-tuning run starts from.
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth'  # noqa
+# Passed to `model.language_model.name` below.
+lang_model_name = 'bert-base-uncased'
+
+# Full model definition: image preprocessing, BERT language model, Swin
+# backbone, channel-mapping neck, encoder/decoder, detection head with its
+# losses, denoising-query settings, and train/test settings.
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=False,
+    ),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=True,
+        convert_weights=False),
+    # NOTE(review): in_channels presumably matches the channel widths of the
+    # three backbone stages selected by out_indices (embed_dims * 2 / 4 / 8)
+    # - confirm.
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[192, 384, 768],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        num_cp=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256, log_scale=0.0, bias=False),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    # The assigner's match-cost weights (2.0 / 5.0 / 2.0) for the L1 and IoU
+    # terms equal the loss weights of loss_bbox and loss_iou above.
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# dataset settings
+# Training augmentation: random flip, then one of two branches chosen at
+# random - (a) a multi-scale resize, or (b) resize -> random crop ->
+# multi-scale resize.
+# `backend_args` is read from the inherited base config.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The aspect ratio of every image in the train dataset
+                    # is < 7; this follows the original implementation.
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+# Evaluation pipeline: a single fixed-scale resize with no augmentation. It
+# packs the same meta keys as training except the two flip keys.
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=train_pipeline,
+        return_classes=True))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
--- a/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py
+++ b/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py
+_base_ = 'grounding_dino_swin-t_finetune_16xb2_1x_coco.py'
+
+data_root = 'data/cat/'
+class_name = ('cat', )
+num_classes = len(class_name)
+metainfo = dict(classes=class_name, palette=[(220, 20, 60)])
+
+model = dict(bbox_head=dict(num_classes=num_classes))
+
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        metainfo=metainfo,
+        ann_file='annotations/trainval.json',
+        data_prefix=dict(img='images/')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        ann_file='annotations/test.json',
+        data_prefix=dict(img='images/')))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/test.json')
+test_evaluator = val_evaluator
+
+max_epoch = 20
+
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=1, save_best='auto'),
+    logger=dict(type='LoggerHook', interval=5))
+train_cfg = dict(max_epochs=max_epoch, val_interval=1)
+
+param_scheduler = [  # iteration-based linear ramp, then epoch-based step decay
+    dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=30),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epoch,  # same value that is passed to train_cfg.max_epochs
+        by_epoch=True,
+        milestones=[15],  # single decay step at epoch 15
+        gamma=0.1)  # lr is multiplied by 0.1 at the milestone
+]
+
+optim_wrapper = dict(
+    optimizer=dict(lr=0.00005),  # overrides only lr; rest comes from _base_
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),  # decay multiplier 0
+            'backbone': dict(lr_mult=0.1),  # 0.1x the base learning rate
+            'language_model': dict(lr_mult=0),  # lr x0; presumably frozen
+        }))
+
+auto_scale_lr = dict(base_batch_size=16)
--- a/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
+++ b/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=True,
+    ),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=False),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[192, 384, 768],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=None,
+    test_cfg=dict(max_per_img=300))
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
--- a/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py
+++ b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py
+_base_ = './grounding_dino_swin-t_pretrain_zeroshot_lvis.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
--- a/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py
+++ b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py
+_base_ = './grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
--- a/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
+++ b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_od_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root + 'annotations/lvis_od_val.json')
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
+++ b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/metafile.yml
+++ b/configs/grounding_dino/metafile.yml
+Collections:
+  - Name: Grounding DINO
+    Metadata:
+      Training Data: Objects365, GoldG, CC3M and COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 3090 GPUs
+      Architecture:
+        - Swin Transformer
+        - BERT
+    Paper:
+      URL: https://arxiv.org/abs/2303.05499
+      Title: 'Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection'
+    README: configs/grounding_dino/README.md
+    Code:
+      URL:
+      Version: v3.0.0
+
+Models:
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg_cap4m
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.5
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth
+  - Name: grounding_dino_swin-b_pretrain_mixeddata
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 56.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
+  - Name: grounding_dino_swin-t_finetune_16xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 58.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth
+  - Name: grounding_dino_swin-b_finetune_16xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 59.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco/grounding_dino_swin-b_finetune_16xb2_1x_coco_20230921_153201-f219e0c0.pth
+  - Name: grounding_dino_r50_scratch_8xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco/grounding_dino_r50_scratch_1x_coco-fe0002f2.pth
--- a/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py
+++ b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py
+_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py
+++ b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py
+_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE w. prompt 0.548; wo. prompt 0.764
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE w. prompt 0.728; wo. prompt 0.670
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE w. prompt 0.221; wo. prompt 0.478
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
+++ b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
--- a/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
+++ b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE w. prompt 0.526, wo. prompt 0.608
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE w. prompt 0.695; wo. prompt 0.687
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE w. prompt 0.137; wo. prompt 0.215
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator