_base_ = [
'../_base_/datasets/coco_caption.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipCaption',
vision_encoder=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
decoder_head=dict(
type='SeqGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
),
prompt='a picture of ',
max_txt_len=20,
)
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
train_cfg = dict(max_epochs=10)
val_cfg = dict()
test_cfg = dict()
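The config above is a standard mmengine-style file; for reference, it can be loaded and tweaked programmatically before training. A minimal sketch (the path comes from the BLIP metafile later in this commit; the learning-rate override is just an example):

```python
from mmengine.config import Config

# Load the BLIP caption config and inspect/override fields before training.
cfg = Config.fromfile('configs/blip/blip-base_8xb32_caption.py')
print(cfg.model.type)                  # 'BlipCaption'
cfg.optim_wrapper.optimizer.lr = 2e-5  # example override of the 1e-5 default
```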
_base_ = [
'../_base_/datasets/flickr30k_caption.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipCaption',
vision_encoder=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
decoder_head=dict(
type='SeqGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
),
prompt='a picture of ',
max_txt_len=20,
)
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
train_cfg = dict(max_epochs=10)
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/nlvr2.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipNLVR',
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
multimodal_backbone=dict(
type='BertModel',
config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True,
nlvr=True),
add_pooling_layer=False),
)
# optimizer
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.05)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(logger=dict(interval=1))
_base_ = [
'../_base_/datasets/nocaps.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipCaption',
vision_encoder=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
decoder_head=dict(
type='SeqGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
),
prompt='a picture of ',
max_txt_len=20,
)
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/ocrvqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipVQA',
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=480,
patch_size=16,
out_type='raw'),
multimodal_backbone=dict(
type='XBertEncoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
head=dict(
type='VQAGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
inference_method='generate',
),
)
# schedule settings
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.05)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
train_cfg = dict(max_epochs=10, by_epoch=True)
val_cfg = dict()
test_cfg = dict()
# runtime settings
randomness = dict(seed=42)
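The BlipVQA model configured above can also be exercised through mmpretrain's high-level inference API. A minimal sketch (the model name and demo image follow the BLIP metafile and README conventions elsewhere in this commit; the question is an arbitrary example):

```python
from mmpretrain import inference_model

# Ask a free-form question about an image with a pretrained BLIP VQA model.
result = inference_model(
    'blip-base_3rdparty_vqa',            # listed in the BLIP metafile below
    'demo/cat-dog.png',                  # demo image shipped with mmpretrain
    'How many animals are in the image?')
print(result)
```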
_base_ = [
'../_base_/datasets/coco_okvqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipVQA',
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=480,
patch_size=16,
out_type='raw'),
multimodal_backbone=dict(
type='XBertEncoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
head=dict(
type='VQAGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
inference_method='generate',
),
)
# schedule settings
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.05)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
train_cfg = dict(max_epochs=10, by_epoch=True)
val_cfg = dict()
test_cfg = dict()
# runtime settings
randomness = dict(seed=42)
_base_ = [
'../_base_/datasets/coco_retrieval.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipRetrieval',
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
text_backbone=dict(
type='XBertEncoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
vision_neck=dict(
type='Linear',
in_features=768,
out_features=256,
),
text_neck=dict(
type='Linear',
in_features=768,
out_features=256,
),
head=dict(
type='ITCHead',
embed_dim=256,
),
multimodal_head=dict(
type='ITMHead',
hidden_size=768,
with_pooler=False,
),
topk=256,
max_txt_len=35,
)
# optimizer
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.04)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
# learning rate scheduler
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6)
val_cfg = dict(type='RetrievalValLoop')
test_cfg = dict(type='RetrievalTestLoop')
randomness = dict(seed=42)
default_hooks = dict(logger=dict(interval=1))
custom_hooks = [
dict(
type='WarmupParamHook',
param_name='alpha',
module_name='head',
warmup_epochs=2)
]
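The `WarmupParamHook` above ramps the head's `alpha` weight over the first two epochs. The sketch below is illustrative only (it is not mmpretrain's implementation) and shows the general shape of such a hook in mmengine:

```python
from mmengine.hooks import Hook
from mmengine.registry import HOOKS


@HOOKS.register_module()
class AlphaWarmupSketch(Hook):
    """Illustrative sketch: linearly ramp a scalar model attribute
    (e.g. ``model.head.alpha``) over the first ``warmup_epochs`` epochs."""

    def __init__(self, param_name='alpha', module_name='head', warmup_epochs=2):
        self.param_name = param_name
        self.module_name = module_name
        self.warmup_epochs = warmup_epochs
        self._target = None

    def before_train_epoch(self, runner):
        model = getattr(runner.model, 'module', runner.model)  # unwrap DDP
        module = getattr(model, self.module_name)
        if self._target is None:
            self._target = getattr(module, self.param_name)  # remember final value
        scale = min(1.0, (runner.epoch + 1) / self.warmup_epochs)
        setattr(module, self.param_name, scale * self._target)
```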
_base_ = [
'../_base_/datasets/flickr30k_retrieval.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipRetrieval',
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=384,
patch_size=16,
out_type='raw',
),
text_backbone=dict(
type='XBertEncoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
vision_neck=dict(
type='Linear',
in_features=768,
out_features=256,
),
text_neck=dict(
type='Linear',
in_features=768,
out_features=256,
),
head=dict(
type='ITCHead',
embed_dim=256,
),
multimodal_head=dict(
type='ITMHead',
hidden_size=768,
with_pooler=False,
),
topk=256,
max_txt_len=35,
)
# optimizer
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.04)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
# learning rate scheduler
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6)
val_cfg = dict(type='RetrievalValLoop')
test_cfg = dict(type='RetrievalTestLoop')
randomness = dict(seed=42)
default_hooks = dict(logger=dict(interval=1))
custom_hooks = [
dict(
type='WarmupParamHook',
param_name='alpha',
module_name='head',
warmup_epochs=2)
]
_base_ = [
'../_base_/datasets/coco_vg_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='BlipVQA',
tokenizer=dict(type='BlipTokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='VisionTransformer',
arch='b',
img_size=480,
patch_size=16,
out_type='raw'),
multimodal_backbone=dict(
type='XBertEncoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
head=dict(
type='VQAGenerationHead',
decoder=dict(
type='XBertLMHeadDecoder',
med_config=dict(
architectures=['BertModel'],
attention_probs_dropout_prob=0.1,
hidden_act='gelu',
hidden_dropout_prob=0.1,
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
layer_norm_eps=1e-12,
max_position_embeddings=512,
model_type='bert',
num_attention_heads=12,
num_hidden_layers=12,
pad_token_id=0,
add_type_embeddings=False,
vocab_size=30524,
encoder_width=768,
add_cross_attention=True),
),
inference_method='rank', # or 'generate'
answer_list_path=
'https://storage.googleapis.com/sfr-vision-language-research/datasets/answer_list.json', # noqa: E501
),
)
# schedule settings
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.05)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
train_cfg = dict(max_epochs=10, by_epoch=True)
test_cfg = dict()
# runtime settings
randomness = dict(seed=42)
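To run the test loop with this config as-is, the usual mmengine entry point is `Runner`. A minimal sketch (the work directory and checkpoint path are placeholders; the config path comes from the BLIP metafile below):

```python
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/blip/blip-base_8xb32_vqa.py')
cfg.work_dir = 'work_dirs/blip_vqa'              # placeholder output directory
cfg.load_from = 'path/to/blip_vqa_weights.pth'   # placeholder checkpoint
runner = Runner.from_cfg(cfg)
runner.test()
```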
Collections:
- Name: BLIP
Metadata:
Training Data:
- COCO
- VG
- Conceptual Captions
- Conceptual 12M
- SBU captions
Architecture:
- Transformer
Training Resources: 8x A100 GPUs
Paper:
Title: 'BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language
Understanding and Generation'
URL: https://arxiv.org/abs/2201.12086
README: configs/blip/README.md
Models:
- Name: blip-base_8xb16_refcoco
Metadata:
FLOPs: null
Parameters: 498488636
In Collection: BLIP
Results:
- Task: Visual Grounding
Dataset: RefCOCO
Metrics:
Accuracy (testA): 86.14
Accuracy (testB): 77.33
Weights: https://download.openmmlab.com/mmclassification/v1/blip/blip-base_8xb16_refcoco_20230508-d2d10f4c.pth
Config: configs/blip/blip-base_8xb16_refcoco.py
- Name: blip-base_3rdparty_caption
Metadata:
FLOPs: null
Parameters: 223971644
In Collection: BLIP
Results:
- Dataset: COCO
Task: Image Caption
Metrics:
BLEU-4: 40.12
CIDER: 132.82
Weights: https://download.openmmlab.com/mmclassification/v1/blip/blip-base_3rdparty_coco-caption_20230419-a5b71af3.pth
Config: configs/blip/blip-base_8xb32_caption.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth
Code: https://github.com/salesforce/LAVIS
- Name: blip-base_3rdparty_nlvr
Metadata:
FLOPs: null
Parameters: 259372034
In Collection: BLIP
Results:
- Task: NLVR
Dataset: NLVR2
Metrics:
Top 1 Accuracy: 82.33
Weights: https://download.openmmlab.com/mmclassification/v1/blip/blip-base_3rdparty_nlvr_20230427-3b14d33f.pth
Config: configs/blip/blip-base_8xb32_nlvr.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth
Code: https://github.com/salesforce/LAVIS
- Name: blip-base_3rdparty_vqa
Metadata:
FLOPs: null
Parameters: 361478972
In Collection: BLIP
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 78.2
Weights: https://download.openmmlab.com/mmclassification/v1/blip/blip-base_3rdparty-capflit_vqa_20230505-81488941.pth
Config: configs/blip/blip-base_8xb32_vqa.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth
Code: https://github.com/salesforce/LAVIS
- Name: blip-base_3rdparty_retrieval
Metadata:
FLOPs: null
Parameters: 447486979
In Collection: BLIP
Results:
- Task: Image-To-Text Retrieval
Dataset: COCO
Metrics:
Recall@1: 82.52
Recall@5: 95.34
- Task: Text-To-Image Retrieval
Dataset: COCO
Metrics:
Recall@1: 64.82
Recall@5: 86.28
Weights: https://download.openmmlab.com/mmclassification/v1/blip/blip-base_3rdparty_coco-retrieval_20230419-a1804d2c.pth
Config: configs/blip/blip-base_8xb32_retrieval.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth
Code: https://github.com/salesforce/LAVIS
# BLIP-2
> [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](http://arxiv.org/abs/2301.12597)
<!-- [ALGORITHM] -->
## Abstract
The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pre-trained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pre-trained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model’s emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions.
<div align=center>
<img src="https://user-images.githubusercontent.com/30762564/236385045-dc22a621-0a9c-4352-afa4-ca3888044850.png" width="70%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('blip2-opt2.7b_3rdparty-zeroshot_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'a dog and a cat sitting on a blanket'}
```
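**Build the model**

You can also build the model object itself (a minimal sketch; `get_model` downloads the checkpoint listed in the tables below when `pretrained=True`):

```python
from mmpretrain import get_model

# Build the BLIP-2 captioning model with its pretrained weights.
model = get_model('blip2-opt2.7b_3rdparty-zeroshot_caption', pretrained=True)
# List the top-level sub-modules (vision backbone, Q-Former, language model, ...).
print([name for name, _ in model.named_children()])
```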
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/blip2/blip2_8xb32_retrieval.py https://download.openmmlab.com/mmclassification/v1/blip2/blip2_3rdparty_pretrain_20230505-f7ef4390.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (M) | BLEU-4 | CIDER | Config | Download |
| :------------------------------------------ | :--------: | :----: | :----: | :----------------------------------------: | :-------------------------------------------------------------------------------------------: |
| `blip2-opt2.7b_3rdparty-zeroshot_caption`\* | 3770.47 | 32.90 | 111.10 | [config](./blip2-opt2.7b_8xb32_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/blip2/blip2-opt2.7b_3rdparty_pretrain_20230505-b51db4e1.pth) |
### Visual Question Answering on VQAv2
| Model | Params (M) | Accuracy | Config | Download |
| :-------------------------------------- | :--------: | :------: | :------------------------------------: | :-------------------------------------------------------------------------------------------------------: |
| `blip2-opt2.7b_3rdparty-zeroshot_vqa`\* | 3770.47 | 53.50 | [config](./blip2-opt2.7b_8xb16_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/blip2/blip2-opt2.7b_3rdparty_pretrain_20230505-b51db4e1.pth) |
### Image-To-Text Retrieval on COCO
| Model | Params (M) | Recall@1 | Config | Download |
| :--------------------------- | :--------: | :------: | :----------------------------------: | :-------------------------------------------------------------------------------------------------------------: |
| `blip2_3rdparty_retrieval`\* | 1173.19 | 85.40 | [config](./blip2_8xb32_retrieval.py) | [model](https://download.openmmlab.com/mmclassification/v1/blip2/blip2_3rdparty_pretrain_20230505-f7ef4390.pth) |
*Models with * are converted from the [official repo](https://github.com/salesforce/LAVIS). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{blip2,
title={Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models},
author={Li, Junnan and Li, Dongxu and Savarese, Silvio and Hoi, Steven},
year={2023},
eprint={2301.12597},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
_base_ = [
'../_base_/datasets/gqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Blip2VQA',
tokenizer=dict(
type='AutoTokenizer', name_or_path='facebook/opt-2.7b',
use_fast=False),
vision_backbone=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=364,
patch_size=14,
out_indices=-2,
layer_scale_init_value=0.0,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
frozen_stages=39,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw'),
text_backbone=dict(
type='OPTForCausalLM', name_or_path='facebook/opt-2.7b'),
multimodal_backbone=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32),
vision_neck=dict(
type='LinearClsHead',
in_channels=768,
num_classes=2560,
),
prompt='Question: {} Short Answer:',
max_txt_len=10)
# data settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224),
dict(type='PackInputs', algorithm_keys=['question', 'gt_answer']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(224, 224),
interpolation='bicubic',
backend='pillow'),
dict(
type='CleanCaption',
keys=['question'],
),
dict(type='PackInputs', algorithm_keys=['question', 'gt_answer']),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
train_cfg = dict(max_epochs=10)
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Blip2VQA',
tokenizer=dict(
type='AutoTokenizer', name_or_path='facebook/opt-2.7b',
use_fast=False),
vision_backbone=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=364,
patch_size=14,
out_indices=-2,
layer_scale_init_value=0.0,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
frozen_stages=39,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw'),
text_backbone=dict(
type='OPTForCausalLM', name_or_path='facebook/opt-2.7b'),
multimodal_backbone=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32),
vision_neck=dict(
type='LinearClsHead',
in_channels=768,
num_classes=2560,
),
prompt='Question: {} Answer:',
max_txt_len=10)
# data settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
meta_keys=['question_id', 'image_id'],
),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(224, 224),
interpolation='bicubic',
backend='pillow'),
dict(
type='CleanCaption',
keys=['question'],
),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
meta_keys=['question_id', 'image_id'],
),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
train_cfg = dict(max_epochs=10)
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_caption.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Blip2Caption',
tokenizer=dict(
type='AutoTokenizer', name_or_path='facebook/opt-2.7b',
use_fast=False),
vision_backbone=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=364,
patch_size=14,
out_indices=-2,
layer_scale_init_value=0.0,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
frozen_stages=39,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw'),
text_backbone=dict(
type='OPTForCausalLM', name_or_path='facebook/opt-2.7b'),
multimodal_backbone=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32),
vision_neck=dict(
type='LinearClsHead',
in_channels=768,
num_classes=2560,
),
prompt='a photo of',
max_txt_len=30)
# schedule settings
optim_wrapper = dict(optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.05))
param_scheduler = [
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=0,
end=10,
)
]
train_cfg = dict(by_epoch=True, max_epochs=10)
val_cfg = dict()
test_cfg = dict()
# dataset settings
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(364, 364),
interpolation='bicubic',
backend='pillow'),
dict(type='PackInputs', meta_keys=['image_id']),
]
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
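The override pattern above (re-declaring `test_pipeline` and passing it via `val_dataloader = dict(dataset=dict(pipeline=test_pipeline))`) relies on mmengine's recursive dict merge against the `_base_` dataset config. A minimal sketch of inspecting the merged result (the config path comes from the BLIP-2 metafile below):

```python
from mmengine.config import Config

cfg = Config.fromfile('configs/blip2/blip2-opt2.7b_8xb32_caption.py')
# Only `pipeline` was re-specified here; batch size, dataset type and
# annotation files are inherited untouched from the _base_ dataset config.
print(cfg.val_dataloader.dataset.pipeline)
```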
_base_ = [
'../_base_/datasets/coco_retrieval.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Blip2Retrieval',
tokenizer=dict(type='Blip2Tokenizer', name_or_path='bert-base-uncased'),
vision_backbone=dict(
type='BEiTViT',
# eva-g without the final layer
arch=dict(
embed_dims=1408,
num_layers=39,
num_heads=16,
feedforward_channels=6144,
),
img_size=364,
patch_size=14,
layer_scale_init_value=0.0,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
final_norm=False,
use_shared_rel_pos_bias=False,
out_type='raw'),
multimodal_backbone=dict(
type='Qformer',
model_style='bert-base-uncased',
vision_model_width=1408,
add_cross_attention=True,
cross_attention_freq=2,
num_query_token=32),
vision_neck=dict(
type='LinearClsHead',
in_channels=768,
num_classes=256,
),
text_neck=dict(
type='LinearClsHead',
in_channels=768,
num_classes=256,
),
multimodal_head=dict(
type='ITMHead',
hidden_size=768,
with_pooler=False,
),
topk=128,
max_txt_len=35,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(364, 364),
interpolation='bicubic',
backend='pillow'),
dict(type='CleanCaption', keys='text'),
dict(
type='PackInputs',
algorithm_keys=['text', 'gt_text_id', 'gt_image_id'],
meta_keys=['image_id']),
]
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# optimizer
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.04)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer)
# learning rate scheduler
param_scheduler = [dict(type='CosineAnnealingLR', by_epoch=True)]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6)
val_cfg = dict(type='RetrievalValLoop')
test_cfg = dict(type='RetrievalTestLoop')
randomness = dict(seed=42)
Collections:
- Name: BLIP-2
Metadata:
Training Data:
- COCO
- VG
- CC3M
- CC12M
- SBU
- LAION-400M
Training Resources: 8x A100 GPUs
Architecture:
- Transformer
- Q-Former
Paper:
Title: 'BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image
Encoders and Large Language Models'
URL: https://arxiv.org/abs/2301.12597
README: configs/blip2/README.md
Models:
- Name: blip2_3rdparty_retrieval
Metadata:
FLOPs: null
Parameters: 1173191358
In Collection: BLIP-2
Results:
- Task: Image-To-Text Retrieval
Dataset: COCO
Metrics:
Recall@1: 85.4
- Task: Text-To-Image Retrieval
Dataset: COCO
Metrics:
Recall@1: 68.3
Weights: https://download.openmmlab.com/mmclassification/v1/blip2/blip2_3rdparty_pretrain_20230505-f7ef4390.pth
Config: configs/blip2/blip2_8xb32_retrieval.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth
Code: https://github.com/salesforce/LAVIS
- Name: blip2-opt2.7b_3rdparty-zeroshot_vqa
Metadata:
FLOPs: null
Parameters: 3770465152
In Collection: BLIP-2
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 53.5
Weights: https://download.openmmlab.com/mmclassification/v1/blip2/blip2-opt2.7b_3rdparty_pretrain_20230505-b51db4e1.pth
Config: configs/blip2/blip2-opt2.7b_8xb16_vqa.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth
Code: https://github.com/salesforce/LAVIS
- Name: blip2-opt2.7b_3rdparty-zeroshot_caption
Metadata:
FLOPs: null
Parameters: 3770465152
In Collection: BLIP-2
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
BLEU-4: 32.90
CIDER: 111.10
Weights: https://download.openmmlab.com/mmclassification/v1/blip2/blip2-opt2.7b_3rdparty_pretrain_20230505-b51db4e1.pth
Config: configs/blip2/blip2-opt2.7b_8xb32_caption.py
Converted From:
Weights: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth
Code: https://github.com/salesforce/LAVIS
# BYOL
> [Bootstrap your own latent: A new approach to self-supervised Learning](https://arxiv.org/abs/2006.07733)
<!-- [ALGORITHM] -->
## Abstract
**B**ootstrap **Y**our **O**wn **L**atent (BYOL) is a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the same time, we update the target network with a slow-moving average of the online network.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/149720208-5ffbee78-1437-44c7-9ddb-b8caab60d2c3.png" width="800" />
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('resnet50_byol-pre_8xb512-linear-coslr-90e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('byol_resnet50_16xb256-coslr-200e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/byol/byol_resnet50_16xb256-coslr-200e_in1k.py
```
Test:
```shell
python tools/test.py configs/byol/benchmarks/resnet50_8xb512-linear-coslr-90e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220825-7596c6f5.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :-------------------------------------- | :--------: | :-------: | :------------------------------------------------: | :------------------------------------------------------------------------------------------: |
| `byol_resnet50_16xb256-coslr-200e_in1k` | 68.02 | 4.11 | [config](byol_resnet50_16xb256-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220825-de817331.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220825-de817331.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `resnet50_byol-pre_8xb512-linear-coslr-90e_in1k` | [BYOL](https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220825-de817331.pth) | 25.56 | 4.11 | 71.80 | [config](benchmarks/resnet50_8xb512-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220825-7596c6f5.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220825-7596c6f5.json) |
## Citation
```bibtex
@inproceedings{grill2020bootstrap,
title={Bootstrap your own latent: A new approach to self-supervised learning},
author={Grill, Jean-Bastien and Strub, Florian and Altch{\'e}, Florent and Tallec, Corentin and Richemond, Pierre H and Buchatskaya, Elena and Doersch, Carl and Pires, Bernardo Avila and Guo, Zhaohan Daniel and Azar, Mohammad Gheshlaghi and others},
booktitle={NeurIPS},
year={2020}
}
```
_base_ = 'mmdet::mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py'
# https://github.com/open-mmlab/mmdetection/blob/dev-3.x/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py
data_preprocessor = dict(
type='DetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_mask=True,
pad_size_divisor=32)
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
data_preprocessor=data_preprocessor,
backbone=dict(
frozen_stages=-1,
norm_cfg=norm_cfg,
norm_eval=False,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
roi_head=dict(
shared_head=dict(
type='ResLayerExtraNorm',
norm_cfg=norm_cfg,
norm_eval=False,
style='pytorch')))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='RandomChoiceResize',
scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
(1333, 768), (1333, 800)],
keep_ratio=True),
dict(type='RandomFlip', prob=0.5),
dict(type='PackDetInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
custom_imports = dict(
imports=['mmpretrain.models.utils.res_layer_extra_norm'],
allow_failed_imports=False)
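`custom_imports` makes the config loader import the listed module so that `ResLayerExtraNorm` gets registered before the detector is built. A minimal sketch of invoking the same mechanism by hand (assuming mmengine and mmpretrain are installed; the helper below is what mmengine uses internally for `custom_imports`):

```python
from mmengine.utils import import_modules_from_strings

# Importing the module triggers its registry decorators, after which
# type='ResLayerExtraNorm' can be resolved when the model is built.
import_modules_from_strings(
    ['mmpretrain.models.utils.res_layer_extra_norm'],
    allow_failed_imports=False)
```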
_base_ = 'mmdet::mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
# https://github.com/open-mmlab/mmdetection/blob/dev-3.x/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
backbone=dict(frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=False),
neck=dict(norm_cfg=norm_cfg),
roi_head=dict(
bbox_head=dict(type='Shared4Conv1FCBBoxHead', norm_cfg=norm_cfg),
mask_head=dict(norm_cfg=norm_cfg)))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='RandomChoiceResize',
scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
(1333, 768), (1333, 800)],
keep_ratio=True),
dict(type='RandomFlip', prob=0.5),
dict(type='PackDetInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_lars_coslr_90e.py',
'../../_base_/default_runtime.py',
]
model = dict(
backbone=dict(
frozen_stages=4,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
# dataset summary
train_dataloader = dict(batch_size=512)
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
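The `checkpoint=''` field above is intentionally left empty; the self-supervised weights are supplied when launching the benchmark. A minimal sketch of filling it programmatically (the checkpoint path is a placeholder; the config path follows the BYOL README above):

```python
from mmengine.config import Config

# Point the frozen backbone at a BYOL pre-trained checkpoint before
# running the linear-evaluation benchmark.
cfg = Config.fromfile(
    'configs/byol/benchmarks/resnet50_8xb512-linear-coslr-90e_in1k.py')
cfg.model.backbone.init_cfg.checkpoint = 'path/to/byol_pretrained.pth'  # placeholder
print(cfg.model.backbone.init_cfg)
```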