Initial commit

e9cee049 · luopl · e9cee049 · e9cee049 · e9cee049 · e9cee049
Commit e9cee049 authored May 31, 2024 by luopl
20 changed files
--- a/pyproject.toml
+++ b/pyproject.toml
+[build-system]
+requires = ["setuptools","wheel","torch"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "yolo_world"
+version = "0.1.0"
+description = "YOLO-World: Real-time Open Vocabulary Object Detection"
+readme = "README.md"
+keywords = ["object detection"]
+authors = [
+    { name = "Tencent AILab", email = "ronnysong@tencent.com" },
+]
+license = {text = "Apache License 2.0"}
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+requires-python = ">= 3.7"
+
+dependencies = [
+    "wheel",
+    "torch>=2.1.0",
+    "torchvision>=0.16.2",
+    "transformers",
+    "tokenizers",
+    "numpy",
+    "opencv-python",
+    "supervision==0.19.0",
+    "openmim",
+    "mmcv-lite>=2.0.0rc4",
+    "mmdet>=3.0.0",
+    "mmengine>=0.7.1",
+    "mmcv",
+    'mmyolo @ git+https://github.com/onuralpszr/mmyolo.git',
+
+]
+
+[tool.setuptools]
+package-dir = {"yolo_world" = "yolo_world"}
+include-package-data = false
+license-files = ["LICENSE"]
+zip-safe = true
+
+[tool.setuptools.packages.find]
+include = ["yolo_world*"]
+exclude = ["docs*", "tests*","third_party*","assets*"]
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+addict==2.4.0
+aiofiles==23.2.1
+albumentations==1.3.0
+altair==5.3.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.1.1
+cycler==0.12.1
+Cython==3.0.10
+defusedxml==0.7.1
+dnspython==2.6.1
+email_validator==2.1.1
+exceptiongroup==1.2.1
+fastapi==0.111.0
+fastapi-cli==0.0.3
+ffmpy==0.3.2
+filelock==3.14.0
+fonttools==4.51.0
+fsspec==2024.3.1
+gradio==4.16.0
+gradio_client==0.8.1
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.0
+idna==3.7
+imageio==2.34.1
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+lazy_loader==0.4
+lvis==0.5.3
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.7.5
+mdurl==0.1.2
+mmdet==3.0.0
+mmengine==0.10.3
+# Editable install with no version control (mmyolo==0.6.0)
+networkx==3.1
+numpy==1.23.5
+opencv-python==4.9.0.80
+opencv-python-headless==4.9.0.80
+orjson==3.10.3
+packaging==24.0
+pandas==2.0.3
+pillow==10.3.0
+pkgutil_resolve_name==1.3.10
+platformdirs==4.2.1
+prettytable==3.10.0
+pycocotools==2.0.7
+pydantic==2.7.1
+pydantic_core==2.18.2
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyWavelets==1.4.1
+PyYAML==6.0.1
+qudida==0.0.4
+referencing==0.35.1
+regex==2024.4.28
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.4.3
+safetensors==0.4.3
+scikit-image==0.21.0
+scikit-learn==1.3.2
+scipy==1.10.0
+semantic-version==2.10.0
+shapely==2.0.4
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.37.2
+supervision==0.20.0
+termcolor==2.4.0
+terminaltables==3.1.10
+threadpoolctl==3.5.0
+tifffile==2023.7.10
+timm==0.6.13
+tokenizers==0.15.2
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+tqdm==4.66.4
+transformers==4.36.2
+typer==0.12.3
+typing_extensions==4.11.0
+tzdata==2024.1
+ujson==5.9.0
+urllib3==2.2.1
+uvicorn==0.29.0
+uvloop==0.19.0
+watchfiles==0.21.0
+wcwidth==0.2.13
+websockets==11.0.3
+yapf==0.40.2
+zipp==3.18.1
--- a/requirements/basic_requirements.txt
+++ b/requirements/basic_requirements.txt
+opencv-python==4.9.0.80
+opencv-python-headless==4.2.0.34
+mmcv==2.0.0
+mmdet==3.0.0
+mmengine==0.10.3
+mmyolo==0.6.0
+timm==0.6.13
+transformers==4.36.2
+albumentations
\ No newline at end of file
--- a/requirements/demo_requirements.txt
+++ b/requirements/demo_requirements.txt
+gradio==4.16.0
+supervision
\ No newline at end of file
--- a/requirements/onnx_requirements.txt
+++ b/requirements/onnx_requirements.txt
+supervision
+onnx
+onnxruntime
+onnxsim
\ No newline at end of file
--- a/tools/dist_test.sh
+++ b/tools/dist_test.sh
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/test.py \
+    $CONFIG \
+    $CHECKPOINT \
+    --launcher pytorch \
+    ${@:4}
--- a/tools/dist_train.sh
+++ b/tools/dist_train.sh
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${MASTER_PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/train.py \
+    $CONFIG \
+    --launcher pytorch ${@:3}
--- a/tools/generate_image_prompts.py
+++ b/tools/generate_image_prompts.py
+import os
+import tqdm
+import argparse
+import os.path as osp
+import numpy as np
+from PIL import Image
+from transformers import (AutoTokenizer, AutoProcessor,
+                          CLIPVisionModelWithProjection,
+                          CLIPTextModelWithProjection)
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='../pretrained_models/open-ai-clip-vit-base-patch32')
+    parser.add_argument('--image-dir', type=str, default='data/samples.txt')
+    parser.add_argument('--out-dir', type=str, default='')
+    parser.add_argument('--out-file', type=str)
+
+    args = parser.parse_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model)
+    text_model = CLIPTextModelWithProjection.from_pretrained(args.model)
+    processor = AutoProcessor.from_pretrained(args.model)
+
+    # padding prompts
+    device = 'cuda:0'
+    text_model.to(device)
+    texts = tokenizer(text=[' '], return_tensors='pt', padding=True)
+    texts = texts.to(device)
+    text_outputs = text_model(**texts)
+    txt_feats = text_outputs.text_embeds
+    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy()
+
+    images = os.listdir(args.image_dir)
+    category_embeds = []
+
+    def _forward_vision_model(image_name):
+        image_path = osp.join(args.image_dir, image_name)
+        # category = image_name.split('-')[1]
+        image = Image.open(image_path).convert("RGB")
+        inputs = processor(images=image, return_tensors="pt", padding=True)
+        image_outputs = vision_model(**inputs)
+        img_feats = image_outputs.image_embeds
+        # img_feats
+        img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
+        img_feats = img_feats.reshape(
+            -1, img_feats.shape[-1])[0].cpu().data.numpy()
+        category_embeds.append(img_feats)
+
+    for image_ in tqdm.tqdm(images):
+        _forward_vision_model(image_)
+    category_embeds.append(txt_feats)
+    category_embeds = np.stack(category_embeds)
+    np.save(osp.join(args.out_dir, args.out_file), category_embeds)
--- a/tools/generate_text_prompts.py
+++ b/tools/generate_text_prompts.py
+import json
+import argparse
+import numpy as np
+from transformers import (AutoTokenizer, CLIPTextModelWithProjection)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='./pretrained_models/clip-vit-base-patch32-projection')
+    parser.add_argument('--text',
+                        type=str,
+                        default='data/captions/coco_class_captions.json')
+    parser.add_argument('--out', type=str, default='output.npy')
+
+    args = parser.parse_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    model = CLIPTextModelWithProjection.from_pretrained(args.model)
+
+    with open(args.text) as f:
+        data = json.load(f)
+    texts = [x[0] for x in data]
+    device = 'cuda:0'
+    model.to(device)
+    texts = tokenizer(text=texts, return_tensors='pt', padding=True)
+    texts = texts.to(device)
+    text_outputs = model(**texts)
+    txt_feats = text_outputs.text_embeds
+    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])
+
+    np.save(args.out, txt_feats.cpu().data.numpy())
--- a/tools/reparameterize_yoloworld.py
+++ b/tools/reparameterize_yoloworld.py
+import os
+import argparse
+
+import torch
+import numpy as np
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser("Reparameterize YOLO-World")
+    parser.add_argument('--model', help='model checkpoints to reparameterize')
+    parser.add_argument('--out-dir', help='output checkpoints')
+    parser.add_argument(
+        '--text-embed',
+        help='text embeddings to reparameterized into YOLO-World')
+    parser.add_argument('--conv-neck',
+                        action='store_true',
+                        help='whether using 1x1 conv in RepVL-PAN')
+
+    args = parser.parse_args()
+    return args
+
+
+def convert_head(scale, bias, text_embed):
+    N, D = text_embed.shape
+    weight = (text_embed * scale.exp()).view(N, D, 1, 1)
+    bias = torch.ones(N) * bias
+    return weight, bias
+
+
+def reparameterize_head(state_dict, embeds):
+
+    cls_layers = [
+        'bbox_head.head_module.cls_contrasts.0',
+        'bbox_head.head_module.cls_contrasts.1',
+        'bbox_head.head_module.cls_contrasts.2'
+    ]
+
+    for i in range(3):
+        scale = state_dict[cls_layers[i] + '.logit_scale']
+        bias = state_dict[cls_layers[i] + '.bias']
+        weight, bias = convert_head(scale, bias, embeds)
+        state_dict[cls_layers[i] + '.conv.weight'] = weight
+        state_dict[cls_layers[i] + '.conv.bias'] = bias
+        del state_dict[cls_layers[i] + '.bias']
+        del state_dict[cls_layers[i] + '.logit_scale']
+    return state_dict
+
+
+def convert_neck_split_conv(input_state_dict, block_name, text_embeds,
+                            num_heads):
+    if block_name + '.guide_fc.weight' not in input_state_dict:
+        return input_state_dict
+    guide_fc_weight = input_state_dict[block_name + '.guide_fc.weight']
+    guide_fc_bias = input_state_dict[block_name + '.guide_fc.bias']
+    guide = text_embeds @ guide_fc_weight.transpose(0,
+                                                    1) + guide_fc_bias[None, :]
+    N, D = guide.shape
+    guide = list(guide.split(D // num_heads, dim=1))
+    del input_state_dict[block_name + '.guide_fc.weight']
+    del input_state_dict[block_name + '.guide_fc.bias']
+    for i in range(num_heads):
+        input_state_dict[block_name +
+                         f'.guide_convs.{i}.weight'] = guide[i][:, :, None,
+                                                                None]
+    return input_state_dict
+
+
+def convert_neck_weight(input_state_dict, block_name, embeds, num_heads):
+    guide_fc_weight = input_state_dict[block_name + '.guide_fc.weight']
+    guide_fc_bias = input_state_dict[block_name + '.guide_fc.bias']
+    guide = embeds @ guide_fc_weight.transpose(0, 1) + guide_fc_bias[None, :]
+    N, D = guide.shape
+    del input_state_dict[block_name + '.guide_fc.weight']
+    del input_state_dict[block_name + '.guide_fc.bias']
+    input_state_dict[block_name + '.guide_weight'] = guide.view(
+        N, D // num_heads, num_heads)
+    return input_state_dict
+
+
+def reparameterize_neck(state_dict, embeds, type='conv'):
+
+    neck_blocks = [
+        'neck.top_down_layers.0.attn_block',
+        'neck.top_down_layers.1.attn_block',
+        'neck.bottom_up_layers.0.attn_block',
+        'neck.bottom_up_layers.1.attn_block'
+    ]
+    if "neck.top_down_layers.0.attn_block.bias" not in state_dict:
+        return state_dict
+    for block in neck_blocks:
+        num_heads = state_dict[block + '.bias'].shape[0]
+        if type == 'conv':
+            convert_neck_split_conv(state_dict, block, embeds, num_heads)
+        else:
+            convert_neck_weight(state_dict, block, embeds, num_heads)
+    return state_dict
+
+
+def main():
+
+    args = parse_args()
+
+    # load checkpoint
+    model = torch.load(args.model, map_location='cpu')
+    state_dict = model['state_dict']
+
+    # load embeddings
+    embeddings = torch.from_numpy(np.load(args.text_embed))
+
+    # remove text encoder
+    keys = list(state_dict.keys())
+    keys = [x for x in keys if "text_model" not in x]
+
+    state_dict_wo_text = {x: state_dict[x] for x in keys}
+    print("removing text encoder")
+
+    state_dict_wo_text = reparameterize_head(state_dict_wo_text, embeddings)
+    print("reparameterizing head")
+
+    if args.conv_neck:
+        neck_type = "conv"
+    else:
+        neck_type = "linear"
+
+    state_dict_wo_text = reparameterize_neck(state_dict_wo_text, embeddings,
+                                             neck_type)
+
+    print("reparameterizing neck")
+
+    model['state_dict'] = state_dict_wo_text
+
+    model_name = os.path.basename(args.model)
+    model_name = model_name.replace('.pth', f'_rep_{neck_type}.pth')
+    torch.save(model, os.path.join(args.out_dir, model_name))
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/test.py
+++ b/tools/test.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+
+from mmdet.engine.hooks.utils import trigger_visualization_hook
+from mmengine.config import Config, ConfigDict, DictAction
+from mmengine.evaluator import DumpResults
+from mmengine.runner import Runner
+
+from mmyolo.registry import RUNNERS
+from mmyolo.utils import is_metainfo_lower
+
+
+# TODO: support fuse_conv_bn
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMYOLO test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--out',
+        type=str,
+        help='output result file (must be a .pkl file) in pickle format')
+    parser.add_argument(
+        '--json-prefix',
+        type=str,
+        help='the prefix of the output json file without perform evaluation, '
+        'which is useful when you want to format the result to a specific '
+        'format and submit it to the test server')
+    parser.add_argument(
+        '--tta',
+        action='store_true',
+        help='Whether to use test time augmentation')
+    parser.add_argument(
+        '--show', action='store_true', help='show prediction results')
+    parser.add_argument(
+        '--deploy',
+        action='store_true',
+        help='Switch model to deployment mode')
+    parser.add_argument(
+        '--show-dir',
+        help='directory where painted images will be saved. '
+        'If specified, it will be automatically saved '
+        'to the work_dir/timestamp/show_dir')
+    parser.add_argument(
+        '--wait-time', type=float, default=2, help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    """Build a runner from the config and CLI options and run testing.
+
+    The config file is loaded, CLI overrides are merged into it, the
+    checkpoint is set as ``cfg.load_from``, optional visualization, deploy,
+    json-format and TTA settings are applied, and finally ``runner.test()``
+    is called.
+    """
+    args = parse_args()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    # replace the ${key} with the value of cfg.key
+    # cfg = replace_cfg_vals(cfg)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.load_from = args.checkpoint
+
+    if args.show or args.show_dir:
+        cfg = trigger_visualization_hook(cfg, args)
+
+    if args.deploy:
+        # NOTE(review): assumes cfg.custom_hooks already exists as a list -
+        # confirm against the configs used with --deploy.
+        cfg.custom_hooks.append(dict(type='SwitchToDeployHook'))
+
+    # add `format_only` and `outfile_prefix` into cfg
+    if args.json_prefix is not None:
+        cfg_json = {
+            'test_evaluator.format_only': True,
+            'test_evaluator.outfile_prefix': args.json_prefix
+        }
+        cfg.merge_from_dict(cfg_json)
+
+    # Determine whether the custom metainfo fields are all lowercase
+    is_metainfo_lower(cfg)
+
+    if args.tta:
+        assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' \
+                                   " Can't use tta !"
+        assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` ' \
+                                      "in config. Can't use tta !"
+
+        # wrap the original model config inside the TTA model config
+        cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
+        # descend through nested `dataset` keys to the innermost dataset cfg
+        test_data_cfg = cfg.test_dataloader.dataset
+        while 'dataset' in test_data_cfg:
+            test_data_cfg = test_data_cfg['dataset']
+
+        # batch_shapes_cfg will force control the size of the output image,
+        # it is not compatible with tta.
+        if 'batch_shapes_cfg' in test_data_cfg:
+            test_data_cfg.batch_shapes_cfg = None
+        test_data_cfg.pipeline = cfg.tta_pipeline
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # add `DumpResults` dummy metric
+    if args.out is not None:
+        assert args.out.endswith(('.pkl', '.pickle')), \
+            'The dump file must be a pkl file.'
+        runner.test_evaluator.metrics.append(
+            DumpResults(out_file_path=args.out))
+
+    # start testing
+    runner.test()
+
+
+if __name__ == '__main__':
+    main()
--- a/tools/train.py
+++ b/tools/train.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import logging
+import os
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.logging import print_log
+from mmengine.runner import Runner
+
+from mmyolo.registry import RUNNERS
+from mmyolo.utils import is_metainfo_lower
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--amp',
+        action='store_true',
+        default=False,
+        help='enable automatic-mixed-precision training')
+    parser.add_argument(
+        '--resume',
+        nargs='?',
+        type=str,
+        const='auto',
+        help='If specify checkpoint path, resume from it, while if not '
+        'specify, try to auto resume from the latest checkpoint '
+        'in the work directory.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    return args
+
+
+def main():
+    """Build a runner from the config and CLI options and run training.
+
+    The config file is loaded, CLI overrides are merged into it, the work
+    directory, AMP and resume settings are resolved, and finally
+    ``runner.train()`` is called.
+    """
+    args = parse_args()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    # replace the ${key} with the value of cfg.key
+    # cfg = replace_cfg_vals(cfg)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        if args.config.startswith('projects/'):
+            # drop the leading 'projects/' and the '/configs/' path segment,
+            # keeping the rest of the relative path below ./work_dirs
+            config = args.config[len('projects/'):]
+            config = config.replace('/configs/', '/')
+            cfg.work_dir = osp.join('./work_dirs', osp.splitext(config)[0])
+        else:
+            cfg.work_dir = osp.join('./work_dirs',
+                                    osp.splitext(osp.basename(args.config))[0])
+
+    # enable automatic-mixed-precision training
+    if args.amp is True:
+        optim_wrapper = cfg.optim_wrapper.type
+        if optim_wrapper == 'AmpOptimWrapper':
+            print_log(
+                'AMP training is already enabled in your config.',
+                logger='current',
+                level=logging.WARNING)
+        else:
+            assert optim_wrapper == 'OptimWrapper', (
+                '`--amp` is only supported when the optimizer wrapper type is '
+                f'`OptimWrapper` but got {optim_wrapper}.')
+            cfg.optim_wrapper.type = 'AmpOptimWrapper'
+            cfg.optim_wrapper.loss_scale = 'dynamic'
+
+    # resume is determined in this priority: resume from > auto_resume
+    if args.resume == 'auto':
+        # `--resume` given without a path: clear load_from and enable resume
+        cfg.resume = True
+        cfg.load_from = None
+    elif args.resume is not None:
+        # `--resume PATH`: resume from the given checkpoint
+        cfg.resume = True
+        cfg.load_from = args.resume
+
+    # Determine whether the custom metainfo fields are all lowercase
+    is_metainfo_lower(cfg)
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # start training
+    runner.train()
+
+
+if __name__ == '__main__':
+    main()
--- a/yolo_world/__init__.py
+++ b/yolo_world/__init__.py
+# Copyright (c) Tencent Inc. All rights reserved.
+import importlib.metadata as importlib_metadata
+
+# Read the version from the metadata of the installed distribution; use the
+# placeholder '0.0.0' when the lookup raises PackageNotFoundError.
+try:
+    __version__ = importlib_metadata.version(__package__ or __name__)
+except importlib_metadata.PackageNotFoundError:
+    __version__ = '0.0.0'
+
+
+from .models import *  # noqa
+from .datasets import *  # noqa
+from .engine import *  # noqa
--- a/yolo_world/datasets/__init__.py
+++ b/yolo_world/datasets/__init__.py
+# Copyright (c) Tencent Inc. All rights reserved.
+from .mm_dataset import (
+    MultiModalDataset, MultiModalMixedDataset)
+from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset
+from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset
+from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset
+from .utils import yolow_collate
+from .transformers import *  # NOQA
+from .yolov5_v3det import YOLOv5V3DetDataset
+from .yolov5_lvis import YOLOv5LVISV1Dataset
+
+__all__ = [
+    'MultiModalDataset', 'YOLOv5Objects365V1Dataset',
+    'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset',
+    'YOLOv5V3DetDataset', 'yolow_collate',
+    'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset',
+]
--- a/yolo_world/datasets/mm_dataset.py
+++ b/yolo_world/datasets/mm_dataset.py
+# Copyright (c) Tencent Inc. All rights reserved.
+import copy
+import json
+import logging
+from typing import Callable, List, Union
+
+from mmengine.logging import print_log
+from mmengine.dataset.base_dataset import (
+        BaseDataset, Compose, force_full_init)
+from mmyolo.registry import DATASETS
+
+
+@DATASETS.register_module()
+class MultiModalDataset:
+    """Multi-modal dataset."""
+
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 class_text_path: str = None,
+                 test_mode: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 lazy_init: bool = False) -> None:
+        self.dataset: BaseDataset
+        if isinstance(dataset, dict):
+            self.dataset = DATASETS.build(dataset)
+        elif isinstance(dataset, BaseDataset):
+            self.dataset = dataset
+        else:
+            raise TypeError(
+                'dataset must be a dict or a BaseDataset, '
+                f'but got {dataset}')
+
+        if class_text_path is not None:
+            self.class_texts = json.load(open(class_text_path, 'r'))
+            # ori_classes = self.dataset.metainfo['classes']
+            # assert len(ori_classes) == len(self.class_texts), \
+            #     ('The number of classes in the dataset and the class text'
+            #      'file must be the same.')
+        else:
+            self.class_texts = None
+
+        self.test_mode = test_mode
+        self._metainfo = self.dataset.metainfo
+        self.pipeline = Compose(pipeline)
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self) -> None:
+        """``full_init`` dataset."""
+        if self._fully_initialized:
+            return
+
+        self.dataset.full_init()
+        self._ori_len = len(self.dataset)
+        self._fully_initialized = True
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index."""
+        data_info = self.dataset.get_data_info(idx)
+        if self.class_texts is not None:
+            data_info.update({'texts': self.class_texts})
+        return data_info
+
+    def __getitem__(self, idx):
+        if not self._fully_initialized:
+            print_log(
+                'Please call `full_init` method manually to '
+                'accelerate the speed.',
+                logger='current',
+                level=logging.WARNING)
+            self.full_init()
+
+        data_info = self.get_data_info(idx)
+
+        if hasattr(self.dataset, 'test_mode') and not self.dataset.test_mode:
+            data_info['dataset'] = self
+        elif not self.test_mode:
+            data_info['dataset'] = self
+        return self.pipeline(data_info)
+
+    @force_full_init
+    def __len__(self) -> int:
+        return self._ori_len
+
+
+@DATASETS.register_module()
+class MultiModalMixedDataset(MultiModalDataset):
+    """Multi-modal Mixed dataset.
+    mix "detection dataset" and "caption dataset"
+    Args:
+        dataset_type (str): dataset type, 'detection' or 'caption'
+    """
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 class_text_path: str = None,
+                 dataset_type: str = 'detection',
+                 test_mode: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 lazy_init: bool = False) -> None:
+        self.dataset_type = dataset_type
+        super().__init__(dataset,
+                         class_text_path,
+                         test_mode,
+                         pipeline,
+                         lazy_init)
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index."""
+        data_info = self.dataset.get_data_info(idx)
+        if self.class_texts is not None:
+            data_info.update({'texts': self.class_texts})
+        data_info['is_detection'] = 1 \
+            if self.dataset_type == 'detection' else 0
+        return data_info
--- a/yolo_world/datasets/transformers/__init__.py
+++ b/yolo_world/datasets/transformers/__init__.py
+# Copyright (c) Tencent Inc. All rights reserved.
+from .mm_transforms import RandomLoadText, LoadText
+from .mm_mix_img_transforms import (
+    MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp,
+    YOLOXMultiModalMixUp)
+
+__all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic',
+           'MultiModalMosaic9', 'YOLOv5MultiModalMixUp',
+           'YOLOXMultiModalMixUp']
--- a/yolo_world/datasets/transformers/mm_mix_img_transforms.py
+++ b/yolo_world/datasets/transformers/mm_mix_img_transforms.py
+# Copyright (c) Tencent Inc. All rights reserved.
+import collections
+import copy
+from abc import ABCMeta, abstractmethod
+from typing import Optional, Sequence, Tuple, Union
+
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmdet.structures.bbox import autocast_box_type
+from mmengine.dataset import BaseDataset
+from mmengine.dataset.base_dataset import Compose
+from numpy import random
+from mmyolo.registry import TRANSFORMS
+
+
+class BaseMultiModalMixImageTransform(BaseTransform, metaclass=ABCMeta):
+    """A Base Transform of Multimodal multiple images mixed.
+
+    Suitable for training on multiple images mixed data augmentation like
+    mosaic and mixup.
+
+    Cached mosaic transform will random select images from the cache
+    and combine them into one output image if use_cached is True.
+
+    Subclasses implement :meth:`get_indexes` (which extra samples to pick)
+    and :meth:`mix_img_transform` (how the picked samples are blended).
+    When the incoming sample carries a ``texts`` entry, the text lists of
+    all picked samples are merged and ``gt_bboxes_labels`` are remapped to
+    the merged list before the images are mixed.
+
+    Args:
+        pre_transform(Sequence[str]): Sequence of transform object or
+            config dict to be composed. Defaults to None.
+        prob(float): The transformation probability. Defaults to 1.0.
+        use_cached (bool): Whether to use cache. Defaults to False.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 40.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        max_refetch (int): The maximum number of retry iterations for getting
+            valid results from the pipeline. If the number of iterations is
+            greater than `max_refetch`, but results is still None, then the
+            iteration is terminated and a ``RuntimeError`` is raised.
+            Defaults to 15.
+    """
+
+    def __init__(self,
+                 pre_transform: Optional[Sequence[str]] = None,
+                 prob: float = 1.0,
+                 use_cached: bool = False,
+                 max_cached_images: int = 40,
+                 random_pop: bool = True,
+                 max_refetch: int = 15):
+
+        self.max_refetch = max_refetch
+        self.prob = prob
+
+        self.use_cached = use_cached
+        self.max_cached_images = max_cached_images
+        self.random_pop = random_pop
+        # Deep copies of previously seen samples; only filled when
+        # ``use_cached`` is True.
+        self.results_cache = []
+
+        if pre_transform is None:
+            self.pre_transform = None
+        else:
+            self.pre_transform = Compose(pre_transform)
+
+    @abstractmethod
+    def get_indexes(self, dataset: Union[BaseDataset,
+                                         list]) -> Union[list, int]:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`Dataset` or list): The dataset or cached list.
+
+        Returns:
+            list or int: indexes.
+        """
+        pass
+
+    @abstractmethod
+    def mix_img_transform(self, results: dict) -> dict:
+        """Mixed image data transformation.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            results (dict): Updated result dict.
+        """
+        pass
+
+    def _update_label_text(self, results: dict) -> dict:
+        """Merge the texts of all mixed samples and remap their labels.
+
+        Every entry of ``texts`` is converted to a tuple, the entries of
+        ``results`` and of each item in ``results['mix_results']`` are
+        de-duplicated into one shared list, and each ``gt_bboxes_labels``
+        value is rewritten in place to the index of its text in that list.
+
+        Args:
+            results (dict): Result dict holding ``mix_results``.
+
+        Returns:
+            results (dict): The same dict; returned unchanged when it has no
+            ``texts`` key.
+        """
+        if 'texts' not in results:
+            return results
+
+        samples = [results] + results['mix_results']
+        # De-duplicate while keeping first-seen order. ``dict.fromkeys`` is
+        # used instead of a ``set`` so that the merged list (and therefore
+        # the remapped label ids) does not depend on string hash
+        # randomisation between runs.
+        mix_texts = list(
+            dict.fromkeys(
+                tuple(text) for res in samples for text in res['texts']))
+        text2id = {text: i for i, text in enumerate(mix_texts)}
+
+        for res in samples:
+            for i, label in enumerate(res['gt_bboxes_labels']):
+                text = res['texts'][label]
+                updated_id = text2id[tuple(text)]
+                res['gt_bboxes_labels'][i] = updated_id
+            # All samples share the same merged list object from here on.
+            res['texts'] = mix_texts
+        return results
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Data augmentation function.
+
+        The transform steps are as follows:
+        1. Randomly generate index list of other images.
+        2. Before Mosaic or MixUp need to go through the necessary
+            pre_transform, such as MixUp' pre_transform pipeline
+            include: 'LoadImageFromFile','LoadAnnotations',
+            'Mosaic' and 'RandomAffine'.
+        3. Merge the texts of all samples and remap the labels.
+        4. Use mix_img_transform function to implement specific
+            mix operations.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            results (dict): Updated result dict.
+
+        Raises:
+            RuntimeError: If no valid set of extra samples could be produced
+                within ``max_refetch`` attempts.
+        """
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        if self.use_cached:
+            # Be careful: deep copying can be very time-consuming
+            # if results includes dataset.
+            dataset = results.pop('dataset', None)
+            self.results_cache.append(copy.deepcopy(results))
+            if len(self.results_cache) > self.max_cached_images:
+                if self.random_pop:
+                    # NOTE: ``random`` is ``numpy.random`` in this module,
+                    # whose ``randint`` excludes the upper bound, so the
+                    # entry appended just above is never the one evicted.
+                    index = random.randint(0, len(self.results_cache) - 1)
+                else:
+                    index = 0
+                self.results_cache.pop(index)
+
+            if len(self.results_cache) <= 4:
+                # Cache warm-up: hand the sample back unmixed, but put the
+                # dataset reference back first so that this exit path
+                # returns the same keys as the other ones.
+                if dataset is not None:
+                    results['dataset'] = dataset
+                return results
+        else:
+            assert 'dataset' in results
+            # Be careful: deep copying can be very time-consuming
+            # if results includes dataset.
+            dataset = results.pop('dataset', None)
+
+        for _ in range(self.max_refetch):
+            # get index of one or three other images
+            if self.use_cached:
+                indexes = self.get_indexes(self.results_cache)
+            else:
+                indexes = self.get_indexes(dataset)
+
+            if not isinstance(indexes, collections.abc.Sequence):
+                indexes = [indexes]
+
+            if self.use_cached:
+                mix_results = [
+                    copy.deepcopy(self.results_cache[i]) for i in indexes
+                ]
+            else:
+                # get images information will be used for Mosaic or MixUp
+                mix_results = [
+                    copy.deepcopy(dataset.get_data_info(index))
+                    for index in indexes
+                ]
+
+            if self.pre_transform is not None:
+                for i, data in enumerate(mix_results):
+                    # pre_transform may also require dataset
+                    data.update({'dataset': dataset})
+                    # before Mosaic or MixUp need to go through
+                    # the necessary pre_transform
+                    _results = self.pre_transform(data)
+                    # The pipeline may return None for a sample. Keep the
+                    # None so the refetch check below retries, instead of
+                    # failing on ``None.pop``.
+                    if _results is not None:
+                        _results.pop('dataset')
+                    mix_results[i] = _results
+
+            if all(res is not None for res in mix_results):
+                results['mix_results'] = mix_results
+                break
+            print('Repeated calculation')
+        else:
+            raise RuntimeError(
+                'The loading pipeline of the original dataset'
+                ' always return None. Please check the correctness '
+                'of the dataset and its pipeline.')
+
+        # update labels and texts
+        results = self._update_label_text(results)
+
+        # Mosaic or MixUp
+        results = self.mix_img_transform(results)
+
+        if 'mix_results' in results:
+            results.pop('mix_results')
+        results['dataset'] = dataset
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class MultiModalMosaic(BaseMultiModalMixImageTransform):
+    """Mosaic augmentation.
+
+    Given 4 images, mosaic transform combines them into
+    one output image. The output image is composed of the parts from each sub-
+    image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |           |
+                |      +-----------+    pad    |
+                |      |           |           |
+                |      |  image1   +-----------+
+                |      |           |           |
+                |      |           |   image2  |
+     center_y   |----+-+-----------+-----------+
+                |    |   cropped   |           |
+                |pad |   image3    |   image4  |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+     The mosaic transform steps are as follows:
+
+         1. Choose the mosaic center as the intersections of 4 images
+         2. Get the left top image according to the index, and randomly
+            sample another 3 images from the custom dataset.
+         3. Sub image will be cropped if image is larger than mosaic patch
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_masks (optional)
+    - mix_results (List[dict])
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+    - gt_masks (optional)
+
+    Args:
+        img_scale (Sequence[int]): Image size after mosaic pipeline of single
+            image. The shape order should be (width, height).
+            Defaults to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Defaults to (0.5, 1.5).
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        pad_val (int): Pad value. Defaults to 114.
+        pre_transform(Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+        use_cached (bool): Whether to use cache. Defaults to False.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 40.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        max_refetch (int): The maximum number of retry iterations for getting
+            valid results from the pipeline. If the number of iterations is
+            greater than `max_refetch`, but results is still None, then the
+            iteration is terminated and raise the error. Defaults to 15.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 bbox_clip_border: bool = True,
+                 pad_val: float = 114.0,
+                 pre_transform: Sequence[dict] = None,
+                 prob: float = 1.0,
+                 use_cached: bool = False,
+                 max_cached_images: int = 40,
+                 random_pop: bool = True,
+                 max_refetch: int = 15):
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
+                                 f'got {prob}.'
+        if use_cached:
+            assert max_cached_images >= 4, 'The length of cache must >= 4, ' \
+                                           f'but got {max_cached_images}.'
+
+        super().__init__(
+            pre_transform=pre_transform,
+            prob=prob,
+            use_cached=use_cached,
+            max_cached_images=max_cached_images,
+            random_pop=random_pop,
+            max_refetch=max_refetch)
+
+        self.img_scale = img_scale
+        self.center_ratio_range = center_ratio_range
+        self.bbox_clip_border = bbox_clip_border
+        self.pad_val = pad_val
+
+    def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`Dataset` or list): The dataset or cached list.
+
+        Returns:
+            list: Three indexes drawn independently (repeats are possible).
+        """
+        # ``random`` is ``numpy.random`` here, so the upper bound
+        # ``len(dataset)`` is exclusive and every index is valid.
+        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
+        return indexes
+
+    def mix_img_transform(self, results: dict) -> dict:
+        """Mixed image data transformation.
+
+        Pastes the current image (top-left) and the three images in
+        ``results['mix_results']`` around a random centre on a canvas of
+        twice ``img_scale``, then shifts the boxes (and masks, if the
+        current sample has ``gt_masks``) into canvas coordinates.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            results (dict): Updated result dict.
+        """
+        assert 'mix_results' in results
+        mosaic_bboxes = []
+        mosaic_bboxes_labels = []
+        mosaic_ignore_flags = []
+        mosaic_masks = []
+        # Masks are processed only when the current sample provides them.
+        with_mask = 'gt_masks' in results
+        # self.img_scale is wh format
+        img_scale_w, img_scale_h = self.img_scale
+        # Canvas size in (height, width) order. The same order is used for
+        # the numpy canvas, the bbox clipping and the mask ``out_shape``.
+        mosaic_h, mosaic_w = int(img_scale_h * 2), int(img_scale_w * 2)
+
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (mosaic_h, mosaic_w, 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full((mosaic_h, mosaic_w),
+                                 self.pad_val,
+                                 dtype=results['img'].dtype)
+
+        # mosaic center x, y
+        center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w)
+        center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h)
+        center_position = (center_x, center_y)
+
+        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        for i, loc in enumerate(loc_strs):
+            if loc == 'top_left':
+                results_patch = results
+            else:
+                results_patch = results['mix_results'][i - 1]
+
+            img_i = results_patch['img']
+            h_i, w_i = img_i.shape[:2]
+            # keep_ratio resize
+            scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i)
+            img_i = mmcv.imresize(
+                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
+
+            # compute the combine parameters
+            paste_coord, crop_coord = self._mosaic_combine(
+                loc, center_position, img_i.shape[:2][::-1])
+            x1_p, y1_p, x2_p, y2_p = paste_coord
+            x1_c, y1_c, x2_c, y2_c = crop_coord
+
+            # crop and paste image
+            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
+
+            # adjust coordinate
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
+            gt_ignore_flags_i = results_patch['gt_ignore_flags']
+
+            # Offset from the resized sub-image frame to the canvas frame.
+            padw = x1_p - x1_c
+            padh = y1_p - y1_c
+            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
+            gt_bboxes_i.translate_([padw, padh])
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
+            mosaic_ignore_flags.append(gt_ignore_flags_i)
+            if with_mask and results_patch.get('gt_masks', None) is not None:
+                gt_masks_i = results_patch['gt_masks']
+                gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i))
+                # ``out_shape`` must be (height, width); ``img_scale`` is
+                # (width, height), so the pre-computed canvas size is used
+                # rather than indexing ``img_scale`` directly.
+                gt_masks_i = gt_masks_i.translate(
+                    out_shape=(mosaic_h, mosaic_w),
+                    offset=padw,
+                    direction='horizontal')
+                gt_masks_i = gt_masks_i.translate(
+                    out_shape=(mosaic_h, mosaic_w),
+                    offset=padh,
+                    direction='vertical')
+                mosaic_masks.append(gt_masks_i)
+
+        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
+        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
+        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
+
+        if self.bbox_clip_border:
+            mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w])
+            if with_mask:
+                mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
+                results['gt_masks'] = mosaic_masks
+        else:
+            # remove outside bboxes
+            inside_inds = mosaic_bboxes.is_inside(
+                [2 * img_scale_h, 2 * img_scale_w]).numpy()
+            mosaic_bboxes = mosaic_bboxes[inside_inds]
+            mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
+            mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
+            if with_mask:
+                mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds]
+                results['gt_masks'] = mosaic_masks
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_bboxes_labels'] = mosaic_bboxes_labels
+        results['gt_ignore_flags'] = mosaic_ignore_flags
+
+        return results
+
+    def _mosaic_combine(
+            self, loc: str, center_position_xy: Sequence[float],
+            img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
+        """Calculate global coordinate of mosaic image and local coordinate of
+        cropped sub-image.
+
+        Args:
+            loc (str): Index for the sub-image, loc in ('top_left',
+              'top_right', 'bottom_left', 'bottom_right').
+            center_position_xy (Sequence[float]): Mixing center for 4 images,
+                (x, y).
+            img_shape_wh (Sequence[int]): Width and height of sub-image
+
+        Returns:
+            tuple[tuple[float]]: Corresponding coordinate of pasting and
+                cropping
+                - paste_coord (tuple): paste corner coordinate in mosaic image.
+                - crop_coord (tuple): crop corner coordinate in the sub-image.
+        """
+        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        if loc == 'top_left':
+            # index0 to top left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             center_position_xy[0], \
+                             center_position_xy[1]
+            crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
+                y2 - y1), img_shape_wh[0], img_shape_wh[1]
+
+        elif loc == 'top_right':
+            # index1 to top right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[0] * 2), \
+                             center_position_xy[1]
+            crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
+                img_shape_wh[0], x2 - x1), img_shape_wh[1]
+
+        elif loc == 'bottom_left':
+            # index2 to bottom left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             center_position_xy[1], \
+                             center_position_xy[0], \
+                             min(self.img_scale[1] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
+                y2 - y1, img_shape_wh[1])
+
+        else:
+            # index3 to bottom right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             center_position_xy[1], \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[0] * 2), \
+                             min(self.img_scale[1] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = 0, 0, min(img_shape_wh[0],
+                                   x2 - x1), min(y2 - y1, img_shape_wh[1])
+
+        paste_coord = x1, y1, x2, y2
+        return paste_coord, crop_coord
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'center_ratio_range={self.center_ratio_range}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class MultiModalMosaic9(BaseMultiModalMixImageTransform):
+    """Mosaic9 augmentation.
+
+    Given 9 images, mosaic transform combines them into
+    one output image. The output image is composed of the parts from each sub-
+    image.
+
+    .. code:: text
+
+                +-------------------------------+------------+
+                | pad           |      pad      |            |
+                |    +----------+               |            |
+                |    |          +---------------+  top_right |
+                |    |          |      top      |   image2   |
+                |    | top_left |     image1    |            |
+                |    |  image8  o--------+------+--------+---+
+                |    |          |        |               |   |
+                +----+----------+        |     right     |pad|
+                |               | center |     image3    |   |
+                |     left      | image0 +---------------+---|
+                |    image7     |        |               |   |
+            +---+-----------+---+--------+               |   |
+            |   |  cropped  |            |  bottom_right |pad|
+            |   |bottom_left|            |    image4     |   |
+            |   |  image6   |   bottom   |               |   |
+            +---|-----------+   image5   +---------------+---|
+                |    pad    |            |        pad        |
+                +-----------+------------+-------------------+
+
+     The mosaic transform steps are as follows:
+
+         1. Get the center image according to the index, and randomly
+            sample another 8 images from the custom dataset.
+         2. Randomly offset the image after Mosaic
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        img_scale (Sequence[int]): Image size after mosaic pipeline of single
+            image. The shape order should be (width, height).
+            Defaults to (640, 640).
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        pad_val (int): Pad value. Defaults to 114.
+        pre_transform(Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+        use_cached (bool): Whether to use cache. Defaults to False.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 5 caches for each image suffices for
+            randomness. Defaults to 50.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        max_refetch (int): The maximum number of retry iterations for getting
+            valid results from the pipeline. If the number of iterations is
+            greater than `max_refetch`, but results is still None, then the
+            iteration is terminated and raise the error. Defaults to 15.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 bbox_clip_border: bool = True,
+                 pad_val: Union[float, int] = 114.0,
+                 pre_transform: Sequence[dict] = None,
+                 prob: float = 1.0,
+                 use_cached: bool = False,
+                 max_cached_images: int = 50,
+                 random_pop: bool = True,
+                 max_refetch: int = 15):
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
+                                 f'got {prob}.'
+        if use_cached:
+            assert max_cached_images >= 9, 'The length of cache must >= 9, ' \
+                                           f'but got {max_cached_images}.'
+
+        super().__init__(
+            pre_transform=pre_transform,
+            prob=prob,
+            use_cached=use_cached,
+            max_cached_images=max_cached_images,
+            random_pop=random_pop,
+            max_refetch=max_refetch)
+
+        self.img_scale = img_scale
+        self.bbox_clip_border = bbox_clip_border
+        self.pad_val = pad_val
+
+        # intermediate variables
+        # (height, width) of the sub-images seen by ``_mosaic_combine``: the
+        # one being placed, the center one, and the one placed just before.
+        self._current_img_shape = [0, 0]
+        self._center_img_shape = [0, 0]
+        self._previous_img_shape = [0, 0]
+
+    def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`Dataset` or list): The dataset or cached list.
+
+        Returns:
+            list: Eight indexes drawn independently (repeats are possible).
+        """
+        # ``random`` is ``numpy.random`` in this module, so the upper bound
+        # ``len(dataset)`` is exclusive.
+        indexes = [random.randint(0, len(dataset)) for _ in range(8)]
+        return indexes
+
+    def mix_img_transform(self, results: dict) -> dict:
+        """Mixed image data transformation.
+
+        Places the current image and the eight images of
+        ``results['mix_results']`` on a canvas of three times ``img_scale``,
+        then crops a window of twice ``img_scale`` at a random offset and
+        moves the boxes into the coordinates of that window. ``gt_masks``
+        is not read or written by this method.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            results (dict): Updated result dict.
+        """
+        assert 'mix_results' in results
+
+        mosaic_bboxes = []
+        mosaic_bboxes_labels = []
+        mosaic_ignore_flags = []
+
+        # self.img_scale is (width, height)
+        img_scale_w, img_scale_h = self.img_scale
+
+        # Canvas of 3x the single-image scale, pre-filled with the pad value.
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (int(img_scale_h * 3), int(img_scale_w * 3), 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)),
+                                 self.pad_val,
+                                 dtype=results['img'].dtype)
+
+        # index 0 is the original image, which goes to the center
+        # len(results['mix_results']) = 8
+        # The order matters: ``_mosaic_combine`` positions each image
+        # relative to the center image and to the image placed before it.
+        loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right',
+                    'bottom', 'bottom_left', 'left', 'top_left')
+
+        results_all = [results, *results['mix_results']]
+        for index, results_patch in enumerate(results_all):
+            img_i = results_patch['img']
+            # keep_ratio resize
+            img_i_h, img_i_w = img_i.shape[:2]
+            scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w)
+            img_i = mmcv.imresize(
+                img_i,
+                (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i)))
+
+            paste_coord = self._mosaic_combine(loc_strs[index],
+                                               img_i.shape[:2])
+
+            # (padw, padh) is the unclamped top-left paste corner; it is the
+            # offset applied to the boxes and may be negative.
+            padw, padh = paste_coord[:2]
+            # Clamp the paste rectangle to non-negative coordinates and skip
+            # the part of the sub-image that would fall before the canvas
+            # origin.
+            x1, y1, x2, y2 = (max(x, 0) for x in paste_coord)
+            mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:]
+
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
+            gt_ignore_flags_i = results_patch['gt_ignore_flags']
+            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
+            gt_bboxes_i.translate_([padw, padh])
+
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
+            mosaic_ignore_flags.append(gt_ignore_flags_i)
+
+        # Offset
+        # Crop a (2 * img_scale_h, 2 * img_scale_w) window out of the 3x
+        # canvas, starting at a random top-left corner.
+        offset_x = int(random.uniform(0, img_scale_w))
+        offset_y = int(random.uniform(0, img_scale_h))
+        mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h,
+                                offset_x:offset_x + 2 * img_scale_w]
+
+        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
+        # Shift the boxes into the coordinate frame of the cropped window.
+        mosaic_bboxes.translate_([-offset_x, -offset_y])
+        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
+        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
+
+        if self.bbox_clip_border:
+            mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w])
+        else:
+            # remove outside bboxes
+            inside_inds = mosaic_bboxes.is_inside(
+                [2 * img_scale_h, 2 * img_scale_w]).numpy()
+            mosaic_bboxes = mosaic_bboxes[inside_inds]
+            mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
+            mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_bboxes_labels'] = mosaic_bboxes_labels
+        results['gt_ignore_flags'] = mosaic_ignore_flags
+        return results
+
+    def _mosaic_combine(self, loc: str,
+                        img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]:
+        """Calculate global coordinate of mosaic image.
+
+        This method is stateful: it records the shape of the center image
+        and of the most recently placed image on ``self`` and uses them to
+        position the next one. ``mix_img_transform`` calls it in the order
+        center, top, top_right, right, bottom_right, bottom, bottom_left,
+        left, top_left.
+
+        Args:
+            loc (str): Index for the sub-image.
+            img_shape_hw (Sequence[int]): Height and width of sub-image
+
+        Returns:
+             paste_coord (tuple): paste corner coordinate in mosaic image,
+             as (xmin, ymin, xmax, ymax). Values are not clamped here and
+             may be negative.
+        """
+        assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right',
+                       'bottom', 'bottom_left', 'left', 'top_left')
+
+        img_scale_w, img_scale_h = self.img_scale
+
+        self._current_img_shape = img_shape_hw
+        current_img_h, current_img_w = self._current_img_shape
+        # Shapes remembered from earlier calls (see the docstring).
+        previous_img_h, previous_img_w = self._previous_img_shape
+        center_img_h, center_img_w = self._center_img_shape
+
+        if loc == 'center':
+            # The center image is anchored with its top-left corner at
+            # (img_scale_w, img_scale_h); remember its shape for later calls.
+            self._center_img_shape = self._current_img_shape
+            #  xmin, ymin, xmax, ymax
+            paste_coord = img_scale_w, \
+                img_scale_h, \
+                img_scale_w + current_img_w, \
+                img_scale_h + current_img_h
+        elif loc == 'top':
+            # Directly above the anchor, bottom edge on the anchor row.
+            paste_coord = img_scale_w, \
+                          img_scale_h - current_img_h, \
+                          img_scale_w + current_img_w, \
+                          img_scale_h
+        elif loc == 'top_right':
+            # To the right of the previous ('top') image.
+            paste_coord = img_scale_w + previous_img_w, \
+                          img_scale_h - current_img_h, \
+                          img_scale_w + previous_img_w + current_img_w, \
+                          img_scale_h
+        elif loc == 'right':
+            # To the right of the center image, top edges aligned.
+            paste_coord = img_scale_w + center_img_w, \
+                          img_scale_h, \
+                          img_scale_w + center_img_w + current_img_w, \
+                          img_scale_h + current_img_h
+        elif loc == 'bottom_right':
+            # Below the previous ('right') image.
+            paste_coord = img_scale_w + center_img_w, \
+                          img_scale_h + previous_img_h, \
+                          img_scale_w + center_img_w + current_img_w, \
+                          img_scale_h + previous_img_h + current_img_h
+        elif loc == 'bottom':
+            # Below the center image, right edges aligned.
+            paste_coord = img_scale_w + center_img_w - current_img_w, \
+                          img_scale_h + center_img_h, \
+                          img_scale_w + center_img_w, \
+                          img_scale_h + center_img_h + current_img_h
+        elif loc == 'bottom_left':
+            # To the left of the previous ('bottom') image.
+            paste_coord = img_scale_w + center_img_w - \
+                          previous_img_w - current_img_w, \
+                          img_scale_h + center_img_h, \
+                          img_scale_w + center_img_w - previous_img_w, \
+                          img_scale_h + center_img_h + current_img_h
+        elif loc == 'left':
+            # To the left of the center image, bottom edges aligned.
+            paste_coord = img_scale_w - current_img_w, \
+                          img_scale_h + center_img_h - current_img_h, \
+                          img_scale_w, \
+                          img_scale_h + center_img_h
+        elif loc == 'top_left':
+            # Above the previous ('left') image.
+            paste_coord = img_scale_w - current_img_w, \
+                          img_scale_h + center_img_h - \
+                          previous_img_h - current_img_h, \
+                          img_scale_w, \
+                          img_scale_h + center_img_h - previous_img_h
+
+        # Remember this image's shape for the next call.
+        self._previous_img_shape = self._current_img_shape
+        #  xmin, ymin, xmax, ymax
+        return paste_coord
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class YOLOv5MultiModalMixUp(BaseMultiModalMixImageTransform):
+    """MixUp data augmentation for YOLOv5.
+
+    .. code:: text
+
+    The mixup transform steps are as follows:
+
+        1. Another random image is picked by dataset.
+        2. Randomly obtain the fusion ratio from the beta distribution,
+            then fuse the target
+        of the original image and mixup image through this ratio.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+
+    Args:
+        alpha (float): parameter of beta distribution to get mixup ratio.
+            Defaults to 32.
+        beta (float):  parameter of beta distribution to get mixup ratio.
+            Defaults to 32.
+        pre_transform (Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+        use_cached (bool): Whether to use cache. Defaults to False.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 20.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        max_refetch (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_refetch`, but gt_bbox is still
+            empty, then the iteration is terminated. Defaults to 15.
+    """
+
+    def __init__(self,
+                 alpha: float = 32.0,
+                 beta: float = 32.0,
+                 pre_transform: Sequence[dict] = None,
+                 prob: float = 1.0,
+                 use_cached: bool = False,
+                 max_cached_images: int = 20,
+                 random_pop: bool = True,
+                 max_refetch: int = 15):
+        if use_cached:
+            assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
+                                           f'but got {max_cached_images}.'
+        super().__init__(
+            pre_transform=pre_transform,
+            prob=prob,
+            use_cached=use_cached,
+            max_cached_images=max_cached_images,
+            random_pop=random_pop,
+            max_refetch=max_refetch)
+        self.alpha = alpha
+        self.beta = beta
+
+    def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`Dataset` or list): The dataset or cached list.
+
+        Returns:
+            int: indexes.
+        """
+        return random.randint(0, len(dataset))
+
+    def mix_img_transform(self, results: dict) -> dict:
+        """YOLOv5 MixUp transform function.
+
+        Args:
+            results (dict): Result dict
+
+        Returns:
+            results (dict): Updated result dict.
+        """
+        assert 'mix_results' in results
+
+        retrieve_results = results['mix_results'][0]
+        retrieve_img = retrieve_results['img']
+        ori_img = results['img']
+        assert ori_img.shape == retrieve_img.shape
+
+        # Randomly obtain the fusion ratio from the beta distribution,
+        # which is around 0.5
+        ratio = np.random.beta(self.alpha, self.beta)
+        mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio))
+
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
+        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
+
+        mixup_gt_bboxes = retrieve_gt_bboxes.cat(
+            (results['gt_bboxes'], retrieve_gt_bboxes), dim=0)
+        mixup_gt_bboxes_labels = np.concatenate(
+            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
+        mixup_gt_ignore_flags = np.concatenate(
+            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
+        if 'gt_masks' in results:
+            assert 'gt_masks' in retrieve_results
+            mixup_gt_masks = results['gt_masks'].cat(
+                [results['gt_masks'], retrieve_results['gt_masks']])
+            results['gt_masks'] = mixup_gt_masks
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
+        results['gt_ignore_flags'] = mixup_gt_ignore_flags
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class YOLOXMultiModalMixUp(BaseMultiModalMixImageTransform):
+    """MixUp data augmentation for YOLOX.
+
+    .. code:: text
+
+                         mixup transform
+                +---------------+--------------+
+                | mixup image   |              |
+                |      +--------|--------+     |
+                |      |        |        |     |
+                +---------------+        |     |
+                |      |                 |     |
+                |      |      image      |     |
+                |      |                 |     |
+                |      |                 |     |
+                |      +-----------------+     |
+                |             pad              |
+                +------------------------------+
+
+    The mixup transform steps are as follows:
+
+        1. Another random image is picked by dataset and embedded in
+           the top left patch(after padding and resizing)
+        2. The target of mixup transform is the weighted average of mixup
+           image and origin image.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+
+    Args:
+        img_scale (Sequence[int]): Image output size after mixup pipeline.
+            The shape order should be (width, height). Defaults to (640, 640).
+        ratio_range (Sequence[float]): Scale ratio of mixup image.
+            Defaults to (0.5, 1.5).
+        flip_ratio (float): Horizontal flip ratio of mixup image.
+            Defaults to 0.5.
+        pad_val (int): Pad value. Defaults to 114.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        pre_transform(Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+        use_cached (bool): Whether to use cache. Defaults to False.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 20.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        max_refetch (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_refetch`, but gt_bbox is still
+            empty, then the iteration is terminated. Defaults to 15.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 flip_ratio: float = 0.5,
+                 pad_val: float = 114.0,
+                 bbox_clip_border: bool = True,
+                 pre_transform: Sequence[dict] = None,
+                 prob: float = 1.0,
+                 use_cached: bool = False,
+                 max_cached_images: int = 20,
+                 random_pop: bool = True,
+                 max_refetch: int = 15):
+        assert isinstance(img_scale, tuple)
+        if use_cached:
+            assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
+                                           f'but got {max_cached_images}.'
+        super().__init__(
+            pre_transform=pre_transform,
+            prob=prob,
+            use_cached=use_cached,
+            max_cached_images=max_cached_images,
+            random_pop=random_pop,
+            max_refetch=max_refetch)
+        self.img_scale = img_scale
+        self.ratio_range = ratio_range
+        self.flip_ratio = flip_ratio
+        self.pad_val = pad_val
+        self.bbox_clip_border = bbox_clip_border
+
+    def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`Dataset` or list): The dataset or cached list.
+
+        Returns:
+            int: indexes.
+        """
+        return random.randint(0, len(dataset))
+
+    def mix_img_transform(self, results: dict) -> dict:
+        """YOLOX MixUp transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            results (dict): Updated result dict.
+        """
+        assert 'mix_results' in results
+        assert len(
+            results['mix_results']) == 1, 'MixUp only support 2 images now !'
+
+        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
+            # empty bbox
+            return results
+
+        retrieve_results = results['mix_results'][0]
+        retrieve_img = retrieve_results['img']
+
+        jit_factor = random.uniform(*self.ratio_range)
+        is_filp = random.uniform(0, 1) > self.flip_ratio
+
+        if len(retrieve_img.shape) == 3:
+            out_img = np.ones((self.img_scale[1], self.img_scale[0], 3),
+                              dtype=retrieve_img.dtype) * self.pad_val
+        else:
+            out_img = np.ones(
+                self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val
+
+        # 1. keep_ratio resize
+        scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0],
+                          self.img_scale[0] / retrieve_img.shape[1])
+        retrieve_img = mmcv.imresize(
+            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
+                           int(retrieve_img.shape[0] * scale_ratio)))
+
+        # 2. paste
+        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
+
+        # 3. scale jit
+        scale_ratio *= jit_factor
+        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
+                                          int(out_img.shape[0] * jit_factor)))
+
+        # 4. flip
+        if is_filp:
+            out_img = out_img[:, ::-1, :]
+
+        # 5. random crop
+        ori_img = results['img']
+        origin_h, origin_w = out_img.shape[:2]
+        target_h, target_w = ori_img.shape[:2]
+        padded_img = np.ones((max(origin_h, target_h), max(
+            origin_w, target_w), 3)) * self.pad_val
+        padded_img = padded_img.astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = out_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
+                                        x_offset:x_offset + target_w]
+
+        # 6. adjust bbox
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
+        if self.bbox_clip_border:
+            retrieve_gt_bboxes.clip_([origin_h, origin_w])
+
+        if is_filp:
+            retrieve_gt_bboxes.flip_([origin_h, origin_w],
+                                     direction='horizontal')
+
+        # 7. filter
+        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
+        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
+        if self.bbox_clip_border:
+            cp_retrieve_gt_bboxes.clip_([target_h, target_w])
+
+        # 8. mix up
+        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img
+
+        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
+        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
+
+        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
+            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
+        mixup_gt_bboxes_labels = np.concatenate(
+            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
+        mixup_gt_ignore_flags = np.concatenate(
+            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
+
+        if not self.bbox_clip_border:
+            # remove outside bbox
+            inside_inds = mixup_gt_bboxes.is_inside([target_h,
+                                                     target_w]).numpy()
+            mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
+            mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
+            mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
+        results['gt_ignore_flags'] = mixup_gt_ignore_flags
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'flip_ratio={self.flip_ratio}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'max_refetch={self.max_refetch}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
--- a/yolo_world/datasets/transformers/mm_transforms.py
+++ b/yolo_world/datasets/transformers/mm_transforms.py
+# Copyright (c) Tencent Inc. All rights reserved.
+import json
+import random
+from typing import Tuple
+
+import numpy as np
+from mmyolo.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class RandomLoadText:
+
+    def __init__(self,
+                 text_path: str = None,
+                 prompt_format: str = '{}',
+                 num_neg_samples: Tuple[int, int] = (80, 80),
+                 max_num_samples: int = 80,
+                 padding_to_max: bool = False,
+                 padding_value: str = '') -> None:
+        self.prompt_format = prompt_format
+        self.num_neg_samples = num_neg_samples
+        self.max_num_samples = max_num_samples
+        self.padding_to_max = padding_to_max
+        self.padding_value = padding_value
+        if text_path is not None:
+            with open(text_path, 'r') as f:
+                self.class_texts = json.load(f)
+
+    def __call__(self, results: dict) -> dict:
+        assert 'texts' in results or hasattr(self, 'class_texts'), (
+            'No texts found in results.')
+        class_texts = results.get(
+            'texts',
+            getattr(self, 'class_texts', None))
+
+        num_classes = len(class_texts)
+        if 'gt_labels' in results:
+            gt_label_tag = 'gt_labels'
+        elif 'gt_bboxes_labels' in results:
+            gt_label_tag = 'gt_bboxes_labels'
+        else:
+            raise ValueError('No valid labels found in results.')
+        positive_labels = set(results[gt_label_tag])
+
+        if len(positive_labels) > self.max_num_samples:
+            positive_labels = set(random.sample(list(positive_labels),
+                                  k=self.max_num_samples))
+
+        num_neg_samples = min(
+            min(num_classes, self.max_num_samples) - len(positive_labels),
+            random.randint(*self.num_neg_samples))
+        candidate_neg_labels = []
+        for idx in range(num_classes):
+            if idx not in positive_labels:
+                candidate_neg_labels.append(idx)
+        negative_labels = random.sample(
+            candidate_neg_labels, k=num_neg_samples)
+
+        sampled_labels = list(positive_labels) + list(negative_labels)
+        random.shuffle(sampled_labels)
+
+        label2ids = {label: i for i, label in enumerate(sampled_labels)}
+
+        gt_valid_mask = np.zeros(len(results['gt_bboxes']), dtype=bool)
+        for idx, label in enumerate(results[gt_label_tag]):
+            if label in label2ids:
+                gt_valid_mask[idx] = True
+                results[gt_label_tag][idx] = label2ids[label]
+        results['gt_bboxes'] = results['gt_bboxes'][gt_valid_mask]
+        results[gt_label_tag] = results[gt_label_tag][gt_valid_mask]
+
+        if 'instances' in results:
+            retaged_instances = []
+            for idx, inst in enumerate(results['instances']):
+                label = inst['bbox_label']
+                if label in label2ids:
+                    inst['bbox_label'] = label2ids[label]
+                    retaged_instances.append(inst)
+            results['instances'] = retaged_instances
+
+        texts = []
+        for label in sampled_labels:
+            cls_caps = class_texts[label]
+            assert len(cls_caps) > 0
+            cap_id = random.randrange(len(cls_caps))
+            sel_cls_cap = self.prompt_format.format(cls_caps[cap_id])
+            texts.append(sel_cls_cap)
+
+        if self.padding_to_max:
+            num_valid_labels = len(positive_labels) + len(negative_labels)
+            num_padding = self.max_num_samples - num_valid_labels
+            if num_padding > 0:
+                texts += [self.padding_value] * num_padding
+
+        results['texts'] = texts
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadText:
+
+    def __init__(self,
+                 text_path: str = None,
+                 prompt_format: str = '{}',
+                 multi_prompt_flag: str = '/') -> None:
+        self.prompt_format = prompt_format
+        self.multi_prompt_flag = multi_prompt_flag
+        if text_path is not None:
+            with open(text_path, 'r') as f:
+                self.class_texts = json.load(f)
+
+    def __call__(self, results: dict) -> dict:
+        assert 'texts' in results or hasattr(self, 'class_texts'), (
+            'No texts found in results.')
+        class_texts = results.get(
+            'texts',
+            getattr(self, 'class_texts', None))
+
+        texts = []
+        for idx, cls_caps in enumerate(class_texts):
+            assert len(cls_caps) > 0
+            sel_cls_cap = cls_caps[0]
+            sel_cls_cap = self.prompt_format.format(sel_cls_cap)
+            texts.append(sel_cls_cap)
+
+        results['texts'] = texts
+
+        return results
--- a/yolo_world/datasets/utils.py
+++ b/yolo_world/datasets/utils.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import torch
+from mmengine.dataset import COLLATE_FUNCTIONS
+
+
+@COLLATE_FUNCTIONS.register_module()
+def yolow_collate(data_batch: Sequence,
+                  use_ms_training: bool = False) -> dict:
+    """Rewrite collate_fn to get faster training speed.
+
+    Args:
+       data_batch (Sequence): Batch of data.
+       use_ms_training (bool): Whether to use multi-scale training.
+    """
+    batch_imgs = []
+    batch_bboxes_labels = []
+    batch_masks = []
+    for i in range(len(data_batch)):
+        datasamples = data_batch[i]['data_samples']
+        inputs = data_batch[i]['inputs']
+        batch_imgs.append(inputs)
+
+        gt_bboxes = datasamples.gt_instances.bboxes.tensor
+        gt_labels = datasamples.gt_instances.labels
+        if 'masks' in datasamples.gt_instances:
+            masks = datasamples.gt_instances.masks.to(
+                dtype=torch.bool, device=gt_bboxes.device)
+            batch_masks.append(masks)
+        batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
+        bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes),
+                                  dim=1)
+        batch_bboxes_labels.append(bboxes_labels)
+
+    collated_results = {
+        'data_samples': {
+            'bboxes_labels': torch.cat(batch_bboxes_labels, 0)
+        }
+    }
+    if len(batch_masks) > 0:
+        collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0)
+
+    if use_ms_training:
+        collated_results['inputs'] = batch_imgs
+    else:
+        collated_results['inputs'] = torch.stack(batch_imgs, 0)
+
+    if hasattr(data_batch[0]['data_samples'], 'texts'):
+        batch_texts = [meta['data_samples'].texts for meta in data_batch]
+        collated_results['data_samples']['texts'] = batch_texts
+
+    if hasattr(data_batch[0]['data_samples'], 'is_detection'):
+        # detection flag
+        batch_detection = [meta['data_samples'].is_detection
+                           for meta in data_batch]
+        collated_results['data_samples']['is_detection'] = torch.tensor(
+            batch_detection)
+
+    return collated_results
--- a/yolo_world/datasets/yolov5_lvis.py
+++ b/yolo_world/datasets/yolov5_lvis.py
+# Copyright (c) Tencent Inc. All rights reserved.
+from mmdet.datasets import LVISV1Dataset
+
+from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
+from mmyolo.registry import DATASETS
+
+
+@DATASETS.register_module()
+class YOLOv5LVISV1Dataset(BatchShapePolicyDataset, LVISV1Dataset):
+    """Dataset for YOLOv5 LVIS Dataset.
+
+    We only add `BatchShapePolicy` function compared with Objects365V1Dataset.
+    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
+    """
+    pass